diff --git a/doc/source/rllib/doc_code/catalog_guide.py b/doc/source/rllib/doc_code/catalog_guide.py index 1a92fccb83e1..6a9a5ef1f083 100644 --- a/doc/source/rllib/doc_code/catalog_guide.py +++ b/doc/source/rllib/doc_code/catalog_guide.py @@ -113,7 +113,7 @@ def __init__(self, *args, **kwargs): config = ( PPOConfig() - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .environment("CartPole-v1") .framework("torch") ) diff --git a/doc/source/rllib/doc_code/new_api_stack.py b/doc/source/rllib/doc_code/new_api_stack.py index 597922bb48df..fbe485c5b665 100644 --- a/doc/source/rllib/doc_code/new_api_stack.py +++ b/doc/source/rllib/doc_code/new_api_stack.py @@ -1,22 +1,19 @@ # __enabling-new-api-stack-sa-ppo-begin__ from ray.rllib.algorithms.ppo import PPOConfig -from ray.rllib.env.single_agent_env_runner import SingleAgentEnvRunner config = ( PPOConfig().environment("CartPole-v1") - # Switch the new API stack flag to True (False by default). - # This enables the use of the RLModule (replaces ModelV2) AND Learner (replaces - # Policy) classes. - .experimental(_enable_new_api_stack=True) - # However, the above flag only activates the RLModule and Learner APIs. In order - # to utilize all of the new API stack's classes, you also have to specify the - # EnvRunner (replaces RolloutWorker) to use. - # Note that this step will be fully automated in the next release. - # Set the `env_runner_cls` to `SingleAgentEnvRunner` for single-agent setups and - # `MultiAgentEnvRunner` for multi-agent cases. - .env_runners(env_runner_cls=SingleAgentEnvRunner) + # Switch both the new API stack flags to True (both False by default). + # This enables the use of + # a) RLModule (replaces ModelV2) and Learner (replaces Policy) + # b) and automatically picks the correct EnvRunner (single-agent vs multi-agent) + # and enables ConnectorV2 support. + .api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, + ) # We are using a simple 1-CPU setup here for learning. However, as the new stack # supports arbitrary scaling on the learner axis, feel free to set # `num_learner_workers` to the number of available GPUs for multi-GPU training (and @@ -43,7 +40,6 @@ # __enabling-new-api-stack-ma-ppo-begin__ from ray.rllib.algorithms.ppo import PPOConfig # noqa -from ray.rllib.env.multi_agent_env_runner import MultiAgentEnvRunner # noqa from ray.rllib.examples.envs.classes.multi_agent import MultiAgentCartPole # noqa @@ -51,17 +47,15 @@ # looks like this. config = ( PPOConfig().environment(MultiAgentCartPole, env_config={"num_agents": 2}) - # Switch the new API stack flag to True (False by default). - # This enables the use of the RLModule (replaces ModelV2) AND Learner (replaces - # Policy) classes. - .experimental(_enable_new_api_stack=True) - # However, the above flag only activates the RLModule and Learner APIs. In order - # to utilize all of the new API stack's classes, you also have to specify the - # EnvRunner (replaces RolloutWorker) to use. - # Note that this step will be fully automated in the next release. - # Set the `env_runner_cls` to `SingleAgentEnvRunner` for single-agent setups and - # `MultiAgentEnvRunner` for multi-agent cases. - .env_runners(env_runner_cls=MultiAgentEnvRunner) + # Switch both the new API stack flags to True (both False by default). 
+ # This enables the use of + # a) RLModule (replaces ModelV2) and Learner (replaces Policy) + # b) and automatically picks the correct EnvRunner (single-agent vs multi-agent) + # and enables ConnectorV2 support. + .api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, + ) # We are using a simple 1-CPU setup here for learning. However, as the new stack # supports arbitrary scaling on the learner axis, feel free to set # `num_learner_workers` to the number of available GPUs for multi-GPU training (and @@ -95,20 +89,19 @@ # __enabling-new-api-stack-sa-sac-begin__ from ray.rllib.algorithms.sac import SACConfig # noqa -from ray.rllib.env.single_agent_env_runner import SingleAgentEnvRunner # noqa config = ( SACConfig().environment("Pendulum-v1") - # Switch the new API stack flag to True (False by default). - # This enables the use of the RLModule (replaces ModelV2) AND Learner (replaces - # Policy) classes. - .experimental(_enable_new_api_stack=True) - # However, the above flag only activates the RLModule and Learner APIs. In order - # to utilize all of the new API stack's classes, you also have to specify the - # EnvRunner (replaces RolloutWorker) to use. - # Note that this step will be fully automated in the next release. - .env_runners(env_runner_cls=SingleAgentEnvRunner) + # Switch both the new API stack flags to True (both False by default). + # This enables the use of + # a) RLModule (replaces ModelV2) and Learner (replaces Policy) + # b) and automatically picks the correct EnvRunner (single-agent vs multi-agent) + # and enables ConnectorV2 support. + .api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, + ) # We are using a simple 1-CPU setup here for learning. However, as the new stack # supports arbitrary scaling on the learner axis, feel free to set # `num_learner_workers` to the number of available GPUs for multi-GPU training (and diff --git a/doc/source/rllib/doc_code/rlmodule_guide.py b/doc/source/rllib/doc_code/rlmodule_guide.py index f00388818413..4a2342b3b10e 100644 --- a/doc/source/rllib/doc_code/rlmodule_guide.py +++ b/doc/source/rllib/doc_code/rlmodule_guide.py @@ -12,7 +12,7 @@ config = ( PPOConfig() - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .framework("torch") .environment("CartPole-v1") ) @@ -80,7 +80,7 @@ config = ( BCConfigTest() - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .environment("CartPole-v1") .rl_module( model_config_dict={"fcnet_hiddens": [32, 32]}, @@ -103,7 +103,7 @@ config = ( BCConfigTest() - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .environment(MultiAgentCartPole, env_config={"num_agents": 2}) .rl_module( model_config_dict={"fcnet_hiddens": [32, 32]}, @@ -406,7 +406,7 @@ def setup(self): config = ( PPOConfig() # Enable the new API stack (RLModule and Learner APIs). 
- .experimental(_enable_new_api_stack=True).environment("CartPole-v1") + .api_stack(enable_rl_module_and_learner=True).environment("CartPole-v1") ) env = gym.make("CartPole-v1") # Create an RL Module that we would like to checkpoint diff --git a/doc/source/rllib/rllib-learner.rst b/doc/source/rllib/rllib-learner.rst index 8900783235f1..d9c4e846b226 100644 --- a/doc/source/rllib/rllib-learner.rst +++ b/doc/source/rllib/rllib-learner.rst @@ -58,7 +58,7 @@ arguments in the :py:class:`~ray.rllib.algorithms.algorithm_config.AlgorithmConf config = ( PPOConfig() - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .resources( num_gpus_per_learner_worker=0, # Set this to 1 to enable GPU training. num_cpus_per_learner_worker=1, @@ -77,7 +77,7 @@ arguments in the :py:class:`~ray.rllib.algorithms.algorithm_config.AlgorithmConf .. note:: This features is in alpha. If you migrate to this algorithm, enable the feature by - via `AlgorithmConfig.experimental(_enable_new_api_stack=True)`. + via `AlgorithmConfig.api_stack(enable_rl_module_and_learner=True)`. The following algorithms support :py:class:`~ray.rllib.core.learner.learner.Learner` out of the box. Implement an algorithm with a custom :py:class:`~ray.rllib.core.learner.learner.Learner` to leverage this API for other algorithms. diff --git a/doc/source/rllib/rllib-rlmodule.rst b/doc/source/rllib/rllib-rlmodule.rst index ece872bc6188..ff6d16a3626d 100644 --- a/doc/source/rllib/rllib-rlmodule.rst +++ b/doc/source/rllib/rllib-rlmodule.rst @@ -64,7 +64,7 @@ RL Module is a neural network container that implements three public methods: :p Enabling RL Modules in the Configuration ---------------------------------------- -Enable RL Modules via our configuration object: ``AlgorithmConfig.experimental(_enable_new_api_stack=True)``. +Enable RL Modules via our configuration object: ``AlgorithmConfig.api_stack(enable_rl_module_and_learner=True)``. .. literalinclude:: doc_code/rlmodule_guide.py :language: python diff --git a/doc/source/rllib/rllib-training.rst b/doc/source/rllib/rllib-training.rst index 52915edbbc45..d6a2c1e8f249 100644 --- a/doc/source/rllib/rllib-training.rst +++ b/doc/source/rllib/rllib-training.rst @@ -220,7 +220,7 @@ These functions return values for each worker as a list. You can also access just the "master" copy of the algorithm state through ``Algorithm.get_policy()`` or ``Algorithm.workers.local_worker()``, but note that updates here may not be immediately reflected in -your rollout workers (if you have configured ``num_rollout_workers > 0``). +your rollout workers (if you have configured ``num_env_runners > 0``). Here's a quick example of how to access state of a model: .. literalinclude:: ./doc_code/getting_started.py diff --git a/rllib/algorithms/algorithm.py b/rllib/algorithms/algorithm.py index 30ca97a47456..53d0a7b3ed4b 100644 --- a/rllib/algorithms/algorithm.py +++ b/rllib/algorithms/algorithm.py @@ -734,7 +734,7 @@ def setup(self, config: AlgorithmConfig) -> None: method_config["type"] = method_type self.learner_group = None - if self.config._enable_new_api_stack: + if self.config.enable_rl_module_and_learner: local_worker = self.workers.local_worker() env = spaces = None # EnvRunners have a `module` property, which stores the RLModule @@ -784,7 +784,7 @@ def setup(self, config: AlgorithmConfig) -> None: # Note that with the new EnvRunner API in combination with the new stack, # this information only needs to be kept in the Learner and not on the # EnvRunners anymore. 
- if not self.config.uses_new_env_runners: + if not self.config.enable_env_runner_and_connector_v2: policies_to_train = self.config.policies_to_train or set( self.config.policies ) @@ -795,7 +795,7 @@ def setup(self, config: AlgorithmConfig) -> None: # Sync the weights from the learner group to the rollout workers. weights = self.learner_group.get_weights( - inference_only=self.config.uses_new_env_runners + inference_only=self.config.enable_env_runner_and_connector_v2 ) local_worker.set_weights(weights) self.workers.sync_weights(inference_only=True) @@ -901,7 +901,7 @@ def step(self) -> ResultDict: # references). Then distribute the episode refs to the learners, store metrics # in special key in result dict and perform the connector merge/broadcast # inside the `training_step` as well. See the new IMPALA for an example. - if self.config.uses_new_env_runners: + if self.config.enable_env_runner_and_connector_v2: # Synchronize EnvToModule and ModuleToEnv connector states and broadcast new # states back to all EnvRunners. with self.metrics.log_time((TIMERS, SYNCH_ENV_CONNECTOR_STATES_TIMER)): @@ -986,7 +986,7 @@ def evaluate( inference_only=True, ) - if self.config.uses_new_env_runners: + if self.config.enable_env_runner_and_connector_v2: # Synchronize EnvToModule and ModuleToEnv connector states and broadcast # new states back to all eval EnvRunners. with self._timers[SYNCH_EVAL_ENV_CONNECTOR_STATES_TIMER]: @@ -1053,7 +1053,7 @@ def evaluate( else: pass - if self.config.uses_new_env_runners: + if self.config.enable_env_runner_and_connector_v2: # Lifetime eval counters. self.metrics.log_dict( { @@ -1148,7 +1148,7 @@ def _evaluate_on_local_env_runner(self, env_runner): logger.info(f"Evaluating current state of {self} for {duration} {unit}.") all_batches = [] - if self.config.uses_new_env_runners: + if self.config.enable_env_runner_and_connector_v2: episodes = env_runner.sample( num_timesteps=duration if unit == "timesteps" else None, num_episodes=duration if unit == "episodes" else None, @@ -1171,7 +1171,7 @@ def _evaluate_on_local_env_runner(self, env_runner): env_runner_results = env_runner.get_metrics() - if not self.config.uses_new_env_runners: + if not self.config.enable_env_runner_and_connector_v2: env_runner_results = summarize_episodes( env_runner_results, env_runner_results, @@ -1230,7 +1230,7 @@ def _env_runner_remote(worker, num, round, iter): ): _round += 1 # New API stack -> EnvRunners return Episodes. - if self.config.uses_new_env_runners: + if self.config.enable_env_runner_and_connector_v2: # Compute rough number of timesteps it takes for a single EnvRunner # to occupy the estimated (parallelly running) train step. _num = min( @@ -1299,7 +1299,7 @@ def _env_runner_remote(worker, num, round, iter): "recreate_failed_env_runners=True)` setting." ) - if not self.config.uses_new_env_runners: + if not self.config.enable_env_runner_and_connector_v2: env_runner_results = summarize_episodes( all_metrics, all_metrics, @@ -1388,7 +1388,7 @@ def _env_runner_remote(worker, num, round, iter): _round += 1 # New API stack -> EnvRunners return Episodes. 
- if self.config.uses_new_env_runners: + if self.config.enable_env_runner_and_connector_v2: _num = [None] + [ # [None]: skip idx=0 (local worker) (units_left_to_do // num_healthy_workers) + bool(i <= (units_left_to_do % num_healthy_workers)) @@ -1428,7 +1428,8 @@ def _env_runner_remote(worker, num, round, iter): units_per_healthy_remote_worker = ( 1 if unit == "episodes" - else eval_cfg.rollout_fragment_length * eval_cfg.num_envs_per_worker + else eval_cfg.rollout_fragment_length + * eval_cfg.num_envs_per_env_runner ) # Select proper number of evaluation workers for this round. selected_eval_worker_ids = [ @@ -1487,7 +1488,7 @@ def _env_runner_remote(worker, num, round, iter): "recreate_failed_env_runners=True)` setting." ) - if not self.config.uses_new_env_runners: + if not self.config.enable_env_runner_and_connector_v2: env_runner_results = summarize_episodes( all_metrics, all_metrics, @@ -1605,7 +1606,7 @@ def training_step(self) -> ResultDict: Returns: The results dict from executing the training iteration. """ - if not self.config.uses_new_env_runners: + if not self.config.enable_env_runner_and_connector_v2: raise NotImplementedError( "The `Algorithm.training_step()` default implementation no longer " "supports the old or hybrid API stacks! If you would like to continue " @@ -1622,7 +1623,9 @@ def training_step(self) -> ResultDict: worker_set=self.workers, max_agent_steps=self.config.train_batch_size, sample_timeout_s=self.config.sample_timeout_s, - _uses_new_env_runners=self.config.uses_new_env_runners, + _uses_new_env_runners=( + self.config.enable_env_runner_and_connector_v2 + ), _return_metrics=True, ) else: @@ -1630,7 +1633,9 @@ def training_step(self) -> ResultDict: worker_set=self.workers, max_env_steps=self.config.train_batch_size, sample_timeout_s=self.config.sample_timeout_s, - _uses_new_env_runners=self.config.uses_new_env_runners, + _uses_new_env_runners=( + self.config.enable_env_runner_and_connector_v2 + ), _return_metrics=True, ) train_batch = train_batch.as_multi_agent() @@ -2097,7 +2102,7 @@ def add_policy( The newly added policy (the copy that got added to the local worker). If `workers` was provided, None is returned. """ - if self.config.uses_new_env_runners: + if self.config.enable_env_runner_and_connector_v2: raise ValueError( "`Algorithm.add_policy()` is not supported on the new API stack w/ " "EnvRunners! Use `Algorithm.add_module()` instead. Also see " @@ -2122,7 +2127,7 @@ def add_policy( # If learner API is enabled, we need to also add the underlying module # to the learner group. - if self.config._enable_new_api_stack: + if self.config.enable_rl_module_and_learner: policy = self.get_policy(policy_id) module = policy.model self.learner_group.add_module( @@ -2278,7 +2283,7 @@ def fn(worker): # Update each Learner's `policies_to_train` information, but only # if the arg is explicitly provided here. - if self.config._enable_new_api_stack and policies_to_train is not None: + if self.config.enable_rl_module_and_learner and policies_to_train is not None: self.learner_group.foreach_learner( lambda learner: learner.config.multi_agent( policies_to_train=policies_to_train @@ -2391,7 +2396,7 @@ def save_checkpoint(self, checkpoint_dir: str) -> None: policy_states = state["worker"].pop("policy_states", {}) # Add RLlib checkpoint version. 
- if self.config._enable_new_api_stack: + if self.config.enable_rl_module_and_learner: state["checkpoint_version"] = CHECKPOINT_VERSION_LEARNER else: state["checkpoint_version"] = CHECKPOINT_VERSION @@ -2426,7 +2431,7 @@ def save_checkpoint(self, checkpoint_dir: str) -> None: policy.export_checkpoint(policy_dir, policy_state=policy_state) # if we are using the learner API, save the learner group state - if self.config._enable_new_api_stack: + if self.config.enable_rl_module_and_learner: learner_state_dir = os.path.join(checkpoint_dir, "learner") self.learner_group.save_state(learner_state_dir) @@ -2438,12 +2443,12 @@ def load_checkpoint(self, checkpoint_dir: str) -> None: checkpoint_info = get_checkpoint_info(checkpoint_dir) checkpoint_data = Algorithm._checkpoint_info_to_algorithm_state(checkpoint_info) self.__setstate__(checkpoint_data) - if self.config._enable_new_api_stack: + if self.config.enable_rl_module_and_learner: learner_state_dir = os.path.join(checkpoint_dir, "learner") self.learner_group.load_state(learner_state_dir) # Make also sure, all training EnvRunners get the just loaded weights. weights = self.learner_group.get_weights( - inference_only=self.config.uses_new_env_runners + inference_only=self.config.enable_env_runner_and_connector_v2 ) self.workers.local_worker().set_weights(weights) self.workers.sync_weights(inference_only=True) @@ -2492,7 +2497,7 @@ def default_resource_request( eval_cf.freeze() # resources for the driver of this trainable - if cf._enable_new_api_stack: + if cf.enable_rl_module_and_learner: if cf.num_learner_workers == 0: # in this case local_worker only does sampling and training is done on # local learner worker @@ -2547,7 +2552,7 @@ def default_resource_request( # resources for remote learner workers learner_bundles = [] - if cf._enable_new_api_stack and cf.num_learner_workers > 0: + if cf.enable_rl_module_and_learner and cf.num_learner_workers > 0: learner_bundles = cls._get_learner_bundles(cf) bundles = [driver] + rollout_bundles + evaluation_bundles + learner_bundles @@ -2692,7 +2697,7 @@ def resource_help(cls, config: Union[AlgorithmConfig, AlgorithmConfigDict]) -> s "\n\nYou can adjust the resource requests of RLlib Algorithms by calling " "`AlgorithmConfig.resources(" "num_gpus=.., num_cpus_per_worker=.., num_gpus_per_worker=.., ..)` or " - "`AgorithmConfig.env_runners(num_rollout_workers=..)`. See " + "`AgorithmConfig.env_runners(num_env_runners=..)`. See " "the `ray.rllib.algorithms.algorithm_config.AlgorithmConfig` classes " "(each Algorithm has its own subclass of this class) for more info.\n\n" f"The config of this Algorithm is: {config}" @@ -2852,7 +2857,7 @@ def __getstate__(self) -> Dict: if ( hasattr(self, "evaluation_workers") and self.evaluation_workers is not None - and not self.config.uses_new_env_runners + and not self.config.enable_env_runner_and_connector_v2 ): state[ "eval_policy_mapping_fn" @@ -2866,7 +2871,7 @@ def __getstate__(self) -> Dict: state["local_replay_buffer"] = self.local_replay_buffer.get_state() # New API stack: Save entire MetricsLogger state. - if self.config.uses_new_env_runners: + if self.config.enable_env_runner_and_connector_v2: state["metrics_logger"] = self.metrics.get_state() # Old API stack: Save only counters. else: @@ -2934,12 +2939,12 @@ def _setup_eval_worker(w): "data found in state!" 
) - if self.config._enable_new_api_stack: + if self.config.enable_rl_module_and_learner: if "learner_state_dir" in state: self.learner_group.load_state(state["learner_state_dir"]) else: logger.warning( - "You configured `_enable_new_api_stack=True`, but no " + "You configured `enable_rl_module_and_learner=True`, but no " "`learner_state_dir` key could be found in the state dict!" ) # Recover MetricsLogger state. @@ -3098,7 +3103,7 @@ def _checkpoint_info_to_algorithm_state( ): worker_state["is_policy_to_train"] = policies_to_train - if state["config"]._enable_new_api_stack: + if state["config"].enable_rl_module_and_learner: state["learner_state_dir"] = os.path.join( checkpoint_info["checkpoint_dir"], "learner" ) @@ -3207,7 +3212,7 @@ def _run_one_evaluation( # To make the old stack forward compatible with the new API stack metrics # structure, we add everything under the new key (EVALUATION_RESULTS) as well as # the old one ("evaluation"). - if not self.config.uses_new_env_runners: + if not self.config.enable_env_runner_and_connector_v2: self.evaluation_metrics["evaluation"] = eval_results return self.evaluation_metrics @@ -3287,7 +3292,7 @@ def _run_one_training_iteration_and_evaluation_in_parallel_wo_thread( # In case all the remote evaluation workers die during a round of # evaluation, we need to stop. units_per_healthy_remote_worker = ( - eval_cfg.rollout_fragment_length * eval_cfg.num_envs_per_worker + eval_cfg.rollout_fragment_length * eval_cfg.num_envs_per_env_runner ) # Select proper number of evaluation workers for this round. selected_eval_worker_ids = [ @@ -3314,7 +3319,7 @@ def _run_one_training_iteration_and_evaluation_in_parallel_wo_thread( env_steps += batch.env_steps() all_metrics.append(metrics) - if not self.config.uses_new_env_runners: + if not self.config.enable_env_runner_and_connector_v2: eval_results = summarize_episodes( all_metrics, all_metrics, @@ -3599,7 +3604,7 @@ def __enter__(self): self.time_start = time.time() self.sampled = 0 self.trained = 0 - if self.algo.config.uses_new_env_runners: + if self.algo.config.enable_env_runner_and_connector_v2: self.init_env_steps_sampled = self.algo.metrics.peek( NUM_ENV_STEPS_SAMPLED_LIFETIME ) @@ -3645,7 +3650,7 @@ def should_stop(self, results): return False # Stopping criteria. 
- if self.algo.config.uses_new_env_runners: + if self.algo.config.enable_env_runner_and_connector_v2: if self.algo.config.count_steps_by == "agent_steps": self.sampled = ( sum( diff --git a/rllib/algorithms/algorithm_config.py b/rllib/algorithms/algorithm_config.py index d16d9c68bf11..58fde3ec5dc3 100644 --- a/rllib/algorithms/algorithm_config.py +++ b/rllib/algorithms/algorithm_config.py @@ -33,7 +33,6 @@ from ray.rllib.env.wrappers.atari_wrappers import is_atari from ray.rllib.evaluation.collectors.sample_collector import SampleCollector from ray.rllib.evaluation.collectors.simple_list_collector import SimpleListCollector -from ray.rllib.evaluation.rollout_worker import RolloutWorker from ray.rllib.models import MODEL_DEFAULTS from ray.rllib.policy.policy import Policy, PolicySpec from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID @@ -319,6 +318,10 @@ def __init__(self, algo_class: Optional[type] = None): ) self.torch_compile_worker_dynamo_mode = None + # `self.api_stack()` + self.enable_rl_module_and_learner = False + self.enable_env_runner_and_connector_v2 = False + # `self.environment()` self.env = None self.env_config = {} @@ -513,7 +516,6 @@ def __init__(self, algo_class: Optional[type] = None): self.__prior_exploration_config = None # `self.experimental()` - self._enable_new_api_stack = False self._tf_policy_handles_more_than_one_loss = False self._disable_preprocessor_api = False self._disable_action_flattening = False @@ -525,7 +527,6 @@ def __init__(self, algo_class: Optional[type] = None): # TODO: Remove, once all deprecation_warning calls upon using these keys # have been removed. # === Deprecated keys === - self.evaluation_num_workers = DEPRECATED_VALUE self.simple_optimizer = DEPRECATED_VALUE self.monitor = DEPRECATED_VALUE self.evaluation_num_episodes = DEPRECATED_VALUE @@ -631,6 +632,7 @@ def to_dict(self) -> AlgorithmConfigDict: "min_train_timesteps_per_reporting", "min_sample_timesteps_per_reporting", "input_evaluation", + "_enable_new_api_stack", ]: if config.get(dep_k) == DEPRECATED_VALUE: config.pop(dep_k, None) @@ -663,10 +665,12 @@ def update_from_dict( # Namely, we want to re-instantiate the exploration config this config had # inside `self.experimental()` before potentially overwriting it in the # following. - if "_enable_new_api_stack" in config_dict: - self.experimental( - _enable_new_api_stack=config_dict["_enable_new_api_stack"] - ) + enable_rl_module_and_learner = config_dict.get( + "_enable_new_api_stack", + config_dict.get("enable_rl_module_and_learner"), + ) + if enable_rl_module_and_learner: + self.api_stack(enable_rl_module_and_learner=enable_rl_module_and_learner) # Modify our properties one by one. for key, value in config_dict.items(): @@ -677,7 +681,7 @@ def update_from_dict( if key == TRIAL_INFO: continue - if key == "_enable_new_api_stack": + if key in ["_enable_new_api_stack", "enable_rl_module_and_learner"]: # We've dealt with this above. continue # Set our multi-agent settings. 
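The `update_from_dict()` change above keeps old-style config dicts working: a deprecated `_enable_new_api_stack` key is routed through the new `api_stack()` call. A minimal sketch of that behavior, using `PPOConfig` purely as an illustrative config class:

from ray.rllib.algorithms.ppo import PPOConfig

# Old-style dict still carrying the deprecated key.
old_style_config_dict = {"_enable_new_api_stack": True}

config = PPOConfig().update_from_dict(old_style_config_dict)
# The deprecated key ends up setting the new flag via `api_stack()`.
assert config.enable_rl_module_and_learner is True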
@@ -710,7 +714,7 @@ def update_from_dict(
             elif key.startswith("evaluation_"):
                 eval_call[key] = value
             elif key == "exploration_config":
-                if config_dict.get("_enable_new_api_stack", False):
+                if enable_rl_module_and_learner:
                     self.exploration_config = value
                     continue
                 if isinstance(value, dict) and "type" in value:
@@ -1401,6 +1405,49 @@ def framework(
 
         return self
 
+    def api_stack(
+        self,
+        enable_rl_module_and_learner: Optional[bool] = NotProvided,
+        enable_env_runner_and_connector_v2: Optional[bool] = NotProvided,
+    ) -> "AlgorithmConfig":
+        """Sets the config's API stack settings.
+
+        Args:
+            enable_rl_module_and_learner: Enables the usage of `RLModule` (instead of
+                `ModelV2`) and Learner (instead of the training-related parts of
+                `Policy`). If `enable_env_runner_and_connector_v2=False`, these two
+                classes (`RLModule` and `Learner`) will be used along with
+                `RolloutWorkers` and `Policy`.
+            enable_env_runner_and_connector_v2: Enables the usage of EnvRunners
+                (SingleAgentEnvRunner and MultiAgentEnvRunner) and ConnectorV2.
+                When setting this to True, `enable_rl_module_and_learner` must be True
+                as well.
+
+        Returns:
+            This updated AlgorithmConfig object.
+        """
+        if enable_rl_module_and_learner is not NotProvided:
+            self.enable_rl_module_and_learner = enable_rl_module_and_learner
+
+            if enable_rl_module_and_learner is True and self.exploration_config:
+                self.__prior_exploration_config = self.exploration_config
+                self.exploration_config = {}
+
+            elif enable_rl_module_and_learner is False and not self.exploration_config:
+                if self.__prior_exploration_config is not None:
+                    self.exploration_config = self.__prior_exploration_config
+                    self.__prior_exploration_config = None
+                else:
+                    logger.warning(
+                        "config.enable_rl_module_and_learner was set to False, but no "
+                        "prior exploration config was found to be restored."
+                    )
+
+        if enable_env_runner_and_connector_v2 is not NotProvided:
+            self.enable_env_runner_and_connector_v2 = enable_env_runner_and_connector_v2
+
+        return self
+
     def environment(
         self,
         env: Optional[Union[str, EnvType]] = NotProvided,
@@ -1627,9 +1674,9 @@ def env_runners(
                 1. RLlib collects 10 fragments of 100 steps each from rollout workers.
                 2. These fragments are concatenated and we perform an epoch of SGD.
                 When using multiple envs per worker, the fragment size is multiplied by
-                `num_envs_per_worker`. This is since we are collecting steps from
-                multiple envs in parallel. For example, if num_envs_per_worker=5, then
-                EnvRunners will return experiences in chunks of 5*100 = 500 steps.
+                `num_envs_per_env_runner`. This is since we are collecting steps from
+                multiple envs in parallel. For example, if num_envs_per_env_runner=5,
+                then EnvRunners will return experiences in chunks of 5*100 = 500 steps.
                 The dataflow here can vary per algorithm. For example, PPO further
                 divides the train batch into minibatches for multi-epoch SGD.
                 Set `rollout_fragment_length` to "auto" to have RLlib compute an exact
@@ -1640,7 +1687,7 @@
                 env- or agent-steps) and depends on the `count_steps_by` setting,
                 adjustable via `AlgorithmConfig.multi_agent(count_steps_by=..)`:
                 1) "truncate_episodes": Each call to `EnvRunner.sample()` will return a
-                batch of at most `rollout_fragment_length * num_envs_per_worker` in
+                batch of at most `rollout_fragment_length * num_envs_per_env_runner` in
                 size. The batch will be exactly `rollout_fragment_length * num_envs` in
                 size if postprocessing does not change batch sizes. Episodes may be
                 truncated in order to meet this size requirement.
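The `api_stack()` method introduced above replaces the old `experimental(_enable_new_api_stack=...)` switch and supports a hybrid and a full setup. A short sketch, with `PPOConfig` chosen only for illustration:

from ray.rllib.algorithms.ppo import PPOConfig

# Hybrid stack: RLModule and Learner, but still RolloutWorker/Policy-based sampling.
hybrid = PPOConfig().api_stack(enable_rl_module_and_learner=True)

# Full new stack: additionally use the new EnvRunners and ConnectorV2 pipelines.
# Per the docstring above, the second flag requires the first one to be True.
full = PPOConfig().api_stack(
    enable_rl_module_and_learner=True,
    enable_env_runner_and_connector_v2=True,
)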
@@ -1648,17 +1695,17 @@ def env_runners( variance as the future return must now be estimated at truncation boundaries. 2) "complete_episodes": Each call to `EnvRunner.sample()` will return a - batch of at least `rollout_fragment_length * num_envs_per_worker` in + batch of at least `rollout_fragment_length * num_envs_per_env_runner` in size. Episodes will not be truncated, but multiple episodes may be packed within one batch to meet the (minimum) batch size. - Note that when `num_envs_per_worker > 1`, episode steps will be buffered - until the episode completes, and hence batches may contain + Note that when `num_envs_per_env_runner > 1`, episode steps will be + buffered until the episode completes, and hence batches may contain significant amounts of off-policy data. explore: Default exploration behavior, iff `explore=None` is passed into compute_action(s). Set to False for no exploration behavior (e.g., for evaluation). exploration_config: A dict specifying the Exploration object's config. - remote_worker_envs: If using num_envs_per_worker > 1, whether to create + remote_worker_envs: If using num_envs_per_env_runner > 1, whether to create those new envs in remote processes instead of in the same worker. This adds overheads, but can make sense if your envs can take much time to step / reset (e.g., for StarCraft). Use this cautiously; @@ -1926,7 +1973,7 @@ def training( full list of the available model options. TODO: Provide ModelConfig objects instead of dicts. optimizer: Arguments to pass to the policy optimizer. This setting is not - used when `_enable_new_api_stack=True`. + used when `enable_rl_module_and_learner=True`. max_requests_in_flight_per_sampler_worker: Max number of inflight requests to each sampling worker. See the FaultTolerantActorManager class for more details. @@ -1940,7 +1987,7 @@ def training( turn down the number of remote requests in flight, or enable compression in your experiment of timesteps. learner_class: The `Learner` class to use for (distributed) updating of the - RLModule. Only used when `_enable_new_api_stack=True`. + RLModule. Only used when `enable_rl_module_and_learner=True`. learner_connector: A callable taking an env observation space and an env action space as inputs and returning a learner ConnectorV2 (might be a pipeline) object. @@ -1989,8 +2036,8 @@ def training( deprecation_warning( old="AlgorithmConfig.training(_use_default_native_models=True)", help="_use_default_native_models is not supported " - "anymore. To get rid of this error, set `config.experimental(" - "_enable_new_api_stack=True)`. Native models will " + "anymore. To get rid of this error, set `config.api_stack(" + "enable_rl_module_and_learner=True)`. Native models will " "be better supported by the upcoming RLModule API.", # Error out if user tries to enable this. error=model["_use_default_native_models"], @@ -2105,8 +2152,9 @@ def evaluation( (default) will make sure that the evaluation results will not be polluted with episode statistics that were actually (at least partially) achieved with an earlier set of weights. Note that this setting is only - supported on the new API stack (`config._enable_new_api_stack=True` - and `config.env_runner_cls=[SingleAgentEnvRunner|MultiAgentEnvrunner]`). + supported on the new API stack w/ EnvRunners and ConnectorV2 + (`config.enable_rl_module_and_learner=True` AND + `config.enable_env_runner_and_connector_v2=True`). 
evaluation_config: Typical usage is to pass extra args to evaluation env creator and to disable exploration by computing deterministic actions. IMPORTANT NOTE: Policy gradient algorithms are able to find the optimal @@ -2368,7 +2416,7 @@ def multi_agent( These tuples or PolicySpecs define the class of the policy, the observation- and action spaces of the policies, and any extra config. algorithm_config_overrides_per_module: Only used if - `_enable_new_api_stack=True`. + `enable_rl_module_and_learner=True`. A mapping from ModuleIDs to per-module AlgorithmConfig override dicts, which apply certain settings, e.g. the learning rate, from the main AlgorithmConfig only to this @@ -2701,8 +2749,8 @@ def fault_tolerance( delay_between_env_runner_restarts_s: Optional[float] = NotProvided, restart_failed_sub_environments: Optional[bool] = NotProvided, num_consecutive_env_runner_failures_tolerance: Optional[int] = NotProvided, - env_runner_health_probe_timeout_s: int = NotProvided, - env_runner_restore_timeout_s: int = NotProvided, + env_runner_health_probe_timeout_s: Optional[float] = NotProvided, + env_runner_restore_timeout_s: Optional[float] = NotProvided, # Deprecated args. ignore_worker_failures=DEPRECATED_VALUE, recreate_failed_workers=DEPRECATED_VALUE, @@ -2742,9 +2790,9 @@ def fault_tolerance( failures, the EnvRunner itself is NOT affected and won't throw any errors as the flawed sub-environment is silently restarted under the hood. - env_runner_health_probe_timeout_s: Max amount of time we should spend - waiting for health probe calls to finish. Health pings are very cheap, - so the default is 1 minute. + env_runner_health_probe_timeout_s: Max amount of time (in seconds) we should + spend waiting for health probe calls to finish. Health pings are very + cheap, so the default is 1 minute. env_runner_restore_timeout_s: Max amount of time we should wait to restore states on recovered EnvRunner actors. Default is 30 mins. @@ -2851,8 +2899,8 @@ def rl_module( if _enable_rl_module_api is not NotProvided: deprecation_warning( - old="AlgorithmConfig.rl_module(_enable_rl_module_api=True|False)", - new="AlgorithmConfig.experimental(_enable_new_api_stack=True|False)", + old="AlgorithmConfig.rl_module(_enable_rl_module_api=..)", + new="AlgorithmConfig.api_stack(enable_rl_module_and_learner=..)", error=False, ) return self @@ -2860,20 +2908,16 @@ def rl_module( def experimental( self, *, - _enable_new_api_stack: Optional[bool] = NotProvided, _tf_policy_handles_more_than_one_loss: Optional[bool] = NotProvided, _disable_preprocessor_api: Optional[bool] = NotProvided, _disable_action_flattening: Optional[bool] = NotProvided, _disable_initialize_loss_from_dummy_batch: Optional[bool] = NotProvided, # Deprecated args. - _disable_execution_plan_api=None, + _enable_new_api_stack=DEPRECATED_VALUE, ) -> "AlgorithmConfig": """Sets the config's experimental settings. Args: - _enable_new_api_stack: Enables the new API stack, which will use RLModule - (instead of ModelV2) as well as the multi-GPU capable Learner API - (instead of using Policy to compute loss and update the model). _tf_policy_handles_more_than_one_loss: Experimental flag. If True, TFPolicy will handle more than one loss/optimizer. Set this to True, if you would like to return more than @@ -2895,31 +2939,13 @@ def experimental( Returns: This updated AlgorithmConfig object. 
""" - if _disable_execution_plan_api is not None: + if _enable_new_api_stack != DEPRECATED_VALUE: deprecation_warning( - old="config.experimental(_disable_execution_plan_api=...)", - help="The execution plan API is no longer supported! Use subclassing " - "of the `Algorithm` class and override the " - "`Algorithm.training_step()` method instead.", - error=True, + old="config.experimental(_enable_new_api_stack=...)", + new="config.api_stack(enable_rl_module_and_learner=...)", + error=False, ) - - if _enable_new_api_stack is not NotProvided: - self._enable_new_api_stack = _enable_new_api_stack - - if _enable_new_api_stack is True and self.exploration_config: - self.__prior_exploration_config = self.exploration_config - self.exploration_config = {} - - elif _enable_new_api_stack is False and not self.exploration_config: - if self.__prior_exploration_config is not None: - self.exploration_config = self.__prior_exploration_config - self.__prior_exploration_config = None - else: - logger.warning( - "config._enable_new_api_stack was set to False, but no prior " - "exploration config was found to be restored." - ) + self.api_stack(enable_rl_module_and_learner=_enable_new_api_stack) if _tf_policy_handles_more_than_one_loss is not NotProvided: self._tf_policy_handles_more_than_one_loss = ( @@ -3002,12 +3028,6 @@ def is_atari(self) -> bool: return self._is_atari - @property - def uses_new_env_runners(self): - return self.env_runner_cls is not None and not issubclass( - self.env_runner_cls, RolloutWorker - ) - @property def total_train_batch_size(self): if self.train_batch_size_per_learner is not None: @@ -3022,7 +3042,7 @@ def get_rollout_fragment_length(self, worker_index: int = 0) -> int: Uses the simple formula: `rollout_fragment_length` = `total_train_batch_size` / - (`num_envs_per_worker` * `num_env_runners`) + (`num_envs_per_env_runner` * `num_env_runners`) If result is a fraction AND `worker_index` is provided, will make those workers add additional timesteps, such that the overall batch size (across @@ -3044,13 +3064,13 @@ def get_rollout_fragment_length(self, worker_index: int = 0) -> int: # -> 512 / 40 -> 12.8 -> diff=32 (12 * 40 = 480) # -> worker 1: 13, workers 2: 12 rollout_fragment_length = self.total_train_batch_size / ( - self.num_envs_per_worker * (self.num_env_runners or 1) + self.num_envs_per_env_runner * (self.num_env_runners or 1) ) if int(rollout_fragment_length) != rollout_fragment_length: diff = self.total_train_batch_size - int( rollout_fragment_length - ) * self.num_envs_per_worker * (self.num_env_runners or 1) - if ((worker_index - 1) * self.num_envs_per_worker) >= diff: + ) * self.num_envs_per_env_runner * (self.num_env_runners or 1) + if ((worker_index - 1) * self.num_envs_per_env_runner) >= diff: return int(rollout_fragment_length) else: return int(rollout_fragment_length) + 1 @@ -3387,7 +3407,7 @@ def validate_train_batch_size_vs_rollout_fragment_length(self) -> None: dependent on rollout_fragment_length (synchronous sampling, on-policy PG algos). If rollout_fragment_length != "auto", makes sure that the product of - `rollout_fragment_length` x `num_env_runners` x `num_envs_per_worker` + `rollout_fragment_length` x `num_env_runners` x `num_envs_per_env_runner` roughly (10%) matches the provided `train_batch_size`. Otherwise, errors with asking the user to set rollout_fragment_length to `auto` or to a matching value. 
@@ -3406,7 +3426,7 @@ def validate_train_batch_size_vs_rollout_fragment_length(self) -> None: ): min_batch_size = ( max(self.num_env_runners, 1) - * self.num_envs_per_worker + * self.num_envs_per_env_runner * self.rollout_fragment_length ) batch_size = min_batch_size @@ -3418,7 +3438,7 @@ def validate_train_batch_size_vs_rollout_fragment_length(self) -> None: 0.1 * self.total_train_batch_size ): suggested_rollout_fragment_length = self.total_train_batch_size // ( - self.num_envs_per_worker * (self.num_env_runners or 1) + self.num_envs_per_env_runner * (self.num_env_runners or 1) ) raise ValueError( "Your desired `total_train_batch_size` " @@ -3426,7 +3446,7 @@ def validate_train_batch_size_vs_rollout_fragment_length(self) -> None: f"learners x {self.train_batch_size_per_learner}) " "or a value 10% off of that cannot be achieved with your other " f"settings (num_env_runners={self.num_env_runners}; " - f"num_envs_per_worker={self.num_envs_per_worker}; " + f"num_envs_per_env_runner={self.num_envs_per_env_runner}; " f"rollout_fragment_length={self.rollout_fragment_length})! " "Try setting `rollout_fragment_length` to 'auto' OR to a value of " f"{suggested_rollout_fragment_length}." @@ -3851,11 +3871,11 @@ def _validate_framework_settings(self) -> None: _torch, _ = try_import_torch() # Can not use "tf" with learner API. - if self.framework_str == "tf" and self._enable_new_api_stack: + if self.framework_str == "tf" and self.enable_rl_module_and_learner: raise ValueError( "Cannot use `framework=tf` with the new API stack! Either switch to tf2" " via `config.framework('tf2')` OR disable the new API stack via " - "`config.experimental(_enable_new_api_stack=False)`." + "`config.api_stack(enable_rl_module_and_learner=False)`." ) # Check if torch framework supports torch.compile. @@ -3931,13 +3951,14 @@ def _validate_multi_agent_settings(self): # multi-agent. if ( self.is_multi_agent() - and self.uses_new_env_runners - and self.num_envs_per_worker > 1 + and self.enable_env_runner_and_connector_v2 + and self.num_envs_per_env_runner > 1 ): raise ValueError( - "For now, using env vectorization (`config.num_envs_per_worker > 1`) " - "in combination with multi-agent AND the new EnvRunners is not " - "supported! Try setting `config.num_envs_per_worker = 1`." + "For now, using env vectorization " + "(`config.num_envs_per_env_runner > 1`) in combination with " + "multi-agent AND the new EnvRunners is not supported! Try setting " + "`config.num_envs_per_env_runner = 1`." ) def _validate_evaluation_settings(self): @@ -4033,14 +4054,14 @@ def _validate_input_settings(self): def _validate_new_api_stack_settings(self): """Checks, whether settings related to the new API stack make sense.""" - if not self._enable_new_api_stack: + if not self.enable_rl_module_and_learner: # Throw a warning if the user has used `self.rl_module(rl_module_spec=...)` # but has not enabled the new API stack at the same time. if self._rl_module_spec is not None: logger.warning( "You have setup a RLModuleSpec (via calling " "`config.rl_module(...)`), but have not enabled the new API stack. " - "To enable it, call `config.experimental(_enable_new_api_stack=" + "To enable it, call `config.api_stack(enable_rl_module_and_learner=" "True)`." ) # Throw a warning if the user has used `self.training(learner_class=...)` @@ -4050,18 +4071,19 @@ def _validate_new_api_stack_settings(self): "You specified a custom Learner class (via " f"`AlgorithmConfig.training(learner_class={self._learner_class})`, " f"but have the new API stack disabled. 
You need to enable it via " - "`AlgorithmConfig.experimental(_enable_new_api_stack=True)`." + "`AlgorithmConfig.api_stack(enable_rl_module_and_learner=True)`." ) # User is using the new EnvRunners, but forgot to switch on - # `_enable_new_api_stack`. - if self.uses_new_env_runners: + # `enable_rl_module_and_learner`. + if self.enable_env_runner_and_connector_v2: raise ValueError( "You are using the new API stack EnvRunners (SingleAgentEnvRunner " "or MultiAgentEnvRunner), but have forgotten to switch on the new " "API stack! Try setting " - "`config.experimental(_enable_new_api_stack=True)`." + "`config.api_stack(enable_rl_module_and_learner=True)`." ) - # Early out. The rest of this method is only for _enable_new_api_stack=True. + # Early out. The rest of this method is only for + # `enable_rl_module_and_learner=True`. return # New API stack (RLModule, Learner APIs) only works with connectors. @@ -4084,7 +4106,7 @@ def _validate_new_api_stack_settings(self): # gym.vector.Env yet and therefore the reset call is still made manually, # allowing for the callback to be fired). if ( - self.uses_new_env_runners + self.enable_env_runner_and_connector_v2 and not self.is_multi_agent() and self.callbacks_class is not DefaultCallbacks ): @@ -4121,8 +4143,8 @@ def _validate_new_api_stack_settings(self): "Cannot use `{}` option with the new API stack (RLModule and " "Learner APIs)! `{}` is part of the ModelV2 API and Policy API," " which are not compatible with the new API stack. You can either " - "deactivate the new stack via `config.experimental( " - "_enable_new_api_stack=False)`," + "deactivate the new stack via `config.api_stack( " + "enable_rl_module_and_learner=False)`," "or use the new stack (incl. RLModule API) and implement your " "custom model as an RLModule." ) @@ -4142,8 +4164,8 @@ def _validate_new_api_stack_settings(self): # TODO (sven): Once everything is on the new API stack, we won't need this method # anymore. def _validate_to_be_deprecated_settings(self): - # `env_task_fn` will be deprecated. - if self._enable_new_api_stack and self.env_task_fn is not None: + # Env task fn will be deprecated. + if self.enable_rl_module_and_learner and self.env_task_fn is not None: deprecation_warning( old="AlgorithmConfig.env_task_fn", help="The `env_task_fn` API is not supported on the new API stack! " @@ -4193,7 +4215,7 @@ def _validate_to_be_deprecated_settings(self): if self.simple_optimizer is True: pass # Multi-GPU setting: Must use MultiGPUTrainOneStep. - elif not self._enable_new_api_stack and self.num_gpus > 1: + elif not self.enable_rl_module_and_learner and self.num_gpus > 1: # TODO: AlphaStar uses >1 GPUs differently (1 per policy actor), so this is # ok for tf2 here. # Remove this hacky check, once we have fully moved to the Learner API. 
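The `_validate_new_api_stack_settings()` check above rejects enabling the EnvRunner/ConnectorV2 flag without the RLModule/Learner flag. Roughly, from the user's side (again with `PPOConfig` only as an example):

from ray.rllib.algorithms.ppo import PPOConfig

config = PPOConfig().api_stack(enable_env_runner_and_connector_v2=True)
try:
    config.validate()
except ValueError as e:
    # The error asks to also set `enable_rl_module_and_learner=True`.
    print(e)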
@@ -4437,6 +4459,25 @@ def rollouts(self, *args, **kwargs):
     def exploration(self, *args, **kwargs):
         return self.env_runners(*args, **kwargs)
 
+    @property
+    @Deprecated(new="AlgorithmConfig.enable_rl_module_and_learner", error=False)
+    def _enable_new_api_stack(self):
+        return self.enable_rl_module_and_learner
+
+    @_enable_new_api_stack.setter
+    def _enable_new_api_stack(self, value):
+        deprecation_warning(
+            old="AlgorithmConfig._enable_new_api_stack",
+            new="AlgorithmConfig.enable_rl_module_and_learner",
+            error=False,
+        )
+        self.enable_rl_module_and_learner = value
+
+    @property
+    @Deprecated(new="AlgorithmConfig.enable_env_runner_and_connector_v2", error=True)
+    def uses_new_env_runners(self):
+        pass
+
     @property
     @Deprecated(new="AlgorithmConfig.num_env_runners", error=False)
     def num_rollout_workers(self):
@@ -4451,6 +4492,20 @@ def num_rollout_workers(self, value):
         )
         self.num_env_runners = value
 
+    @property
+    @Deprecated(new="AlgorithmConfig.evaluation_num_env_runners", error=False)
+    def evaluation_num_workers(self):
+        return self.evaluation_num_env_runners
+
+    @evaluation_num_workers.setter
+    def evaluation_num_workers(self, value):
+        deprecation_warning(
+            old="AlgorithmConfig.evaluation_num_workers",
+            new="AlgorithmConfig.evaluation_num_env_runners",
+            error=False,
+        )
+        self.evaluation_num_env_runners = value
+
     @property
     @Deprecated(new="AlgorithmConfig.num_envs_per_env_runner", error=False)
     def num_envs_per_worker(self):
diff --git a/rllib/algorithms/appo/appo.py b/rllib/algorithms/appo/appo.py
index 4f5caa1259f9..5228e6da3771 100644
--- a/rllib/algorithms/appo/appo.py
+++ b/rllib/algorithms/appo/appo.py
@@ -116,8 +116,8 @@ def __init__(self, algo_class=None):
 
         self.broadcast_interval = 1
         self.grad_clip = 40.0
-        # Note: Only when using _enable_new_api_stack=True can the clipping mode be
-        # configured by the user. On the old API stack, RLlib will always clip by
+        # Note: Only when using enable_rl_module_and_learner=True can the clipping mode
+        # be configured by the user. On the old API stack, RLlib will always clip by
         # global_norm, no matter the value of `grad_clip_by`.
         self.grad_clip_by = "global_norm"
 
@@ -187,7 +187,7 @@ def training(
                 networks and tuned the kl loss coefficients that are used during
                 training.
                 NOTE: This parameter is only applicable when using the Learner API
-                (_enable_new_api_stack=True).
+                (enable_rl_module_and_learner=True).
 
 
         Returns:
@@ -272,7 +272,7 @@ def __init__(self, config, *args, **kwargs):
 
         # TODO(avnishn): Does this need to happen in __init__? I think we can move it
         # to setup()
-        if not self.config._enable_new_api_stack:
+        if not self.config.enable_rl_module_and_learner:
            self.workers.local_worker().foreach_policy_to_train(
                lambda p, _: p.update_target()
            )
@@ -290,7 +290,7 @@ def after_train_step(self, train_results: ResultDict) -> None:
             training step.
         """
 
-        if self.config._enable_new_api_stack:
+        if self.config.enable_rl_module_and_learner:
             if NUM_TARGET_UPDATES in train_results:
                 self._counters[NUM_TARGET_UPDATES] += train_results[NUM_TARGET_UPDATES]
                 self._counters[LAST_TARGET_UPDATE_TS] = train_results[
@@ -376,7 +376,7 @@ def get_default_policy_class(
 
             return APPOTorchPolicy
         elif config["framework"] == "tf":
-            if config._enable_new_api_stack:
+            if config.enable_rl_module_and_learner:
                 raise ValueError(
                     "RLlib's RLModule and Learner API is not supported for"
                     " tf1.
Use " diff --git a/rllib/algorithms/appo/appo_tf_policy.py b/rllib/algorithms/appo/appo_tf_policy.py index 5f6ad462d43b..c39d09f3a989 100644 --- a/rllib/algorithms/appo/appo_tf_policy.py +++ b/rllib/algorithms/appo/appo_tf_policy.py @@ -86,7 +86,7 @@ def __init__( # However, we also would like to avoid creating special Policy-subclasses # for this as the entire Policy concept will soon not be used anymore with # the new Learner- and RLModule APIs. - if not config.get("_enable_new_api_stack", False): + if not config.get("enable_rl_module_and_learner", False): # Although this is a no-op, we call __init__ here to make it clear # that base.__init__ will use the make_model() call. VTraceClipGradients.__init__(self) @@ -111,7 +111,7 @@ def __init__( ValueNetworkMixin.__init__(self, config) KLCoeffMixin.__init__(self, config) - if not config.get("_enable_new_api_stack", False): + if not config.get("enable_rl_module_and_learner", False): GradStatsMixin.__init__(self) # Note: this is a bit ugly, but loss and optimizer initialization must diff --git a/rllib/algorithms/appo/appo_torch_policy.py b/rllib/algorithms/appo/appo_torch_policy.py index 3337f021c3c5..c62e01941ca3 100644 --- a/rllib/algorithms/appo/appo_torch_policy.py +++ b/rllib/algorithms/appo/appo_torch_policy.py @@ -74,7 +74,7 @@ def __init__(self, observation_space, action_space, config): # However, we also would like to avoid creating special Policy-subclasses # for this as the entire Policy concept will soon not be used anymore with # the new Learner- and RLModule APIs. - if not config.get("_enable_new_api_stack", False): + if not config.get("enable_rl_module_and_learner", False): # Although this is a no-op, we call __init__ here to make it clear # that base.__init__ will use the make_model() call. VTraceOptimizer.__init__(self) diff --git a/rllib/algorithms/appo/tests/test_appo_learner.py b/rllib/algorithms/appo/tests/test_appo_learner.py index c6f1b4d96307..a3fcf10d5612 100644 --- a/rllib/algorithms/appo/tests/test_appo_learner.py +++ b/rllib/algorithms/appo/tests/test_appo_learner.py @@ -56,7 +56,7 @@ def test_appo_loss(self): """Test that appo_policy_rlm loss matches the appo learner loss.""" config = ( appo.APPOConfig() - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .environment("CartPole-v1") .env_runners( num_env_runners=0, @@ -104,7 +104,7 @@ def test_kl_coeff_changes(self): initial_kl_coeff = 0.01 config = ( appo.APPOConfig() - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .environment("CartPole-v1") # Asynchronous Algo, make sure we have some results after 1 iteration. 
.reporting(min_time_s_per_iteration=10) diff --git a/rllib/algorithms/appo/tf/appo_tf_learner.py b/rllib/algorithms/appo/tf/appo_tf_learner.py index 1ff7b6fd0a74..7b460a59ae43 100644 --- a/rllib/algorithms/appo/tf/appo_tf_learner.py +++ b/rllib/algorithms/appo/tf/appo_tf_learner.py @@ -72,7 +72,7 @@ def compute_loss_for_module( trajectory_len=rollout_frag_or_episode_len, recurrent_seq_len=recurrent_seq_len, ) - if self.config.uses_new_env_runners: + if self.config.enable_env_runner_and_connector_v2: bootstrap_values = batch[Columns.VALUES_BOOTSTRAPPED] else: bootstrap_values_time_major = make_time_major( diff --git a/rllib/algorithms/appo/torch/appo_torch_learner.py b/rllib/algorithms/appo/torch/appo_torch_learner.py index 41c45958299b..34e2e7e15990 100644 --- a/rllib/algorithms/appo/torch/appo_torch_learner.py +++ b/rllib/algorithms/appo/torch/appo_torch_learner.py @@ -87,7 +87,7 @@ def compute_loss_for_module( trajectory_len=rollout_frag_or_episode_len, recurrent_seq_len=recurrent_seq_len, ) - if self.config.uses_new_env_runners: + if self.config.enable_env_runner_and_connector_v2: bootstrap_values = batch[Columns.VALUES_BOOTSTRAPPED] else: bootstrap_values_time_major = make_time_major( diff --git a/rllib/algorithms/bc/bc.py b/rllib/algorithms/bc/bc.py index 29e2693f159c..f26e061600f4 100644 --- a/rllib/algorithms/bc/bc.py +++ b/rllib/algorithms/bc/bc.py @@ -73,7 +73,7 @@ def __init__(self, algo_class=None): # not important for behavioral cloning. self.postprocess_inputs = False # Set RLModule as default. - self.experimental(_enable_new_api_stack=True) + self.api_stack(enable_rl_module_and_learner=True) # __sphinx_doc_end__ # fmt: on @@ -137,7 +137,7 @@ def get_default_config(cls) -> AlgorithmConfig: @override(MARWIL) def training_step(self) -> ResultDict: - if not self.config._enable_new_api_stack: + if not self.config.enable_rl_module_and_learner: # Using ModelV2. return super().training_step() else: diff --git a/rllib/algorithms/bc/tests/test_bc.py b/rllib/algorithms/bc/tests/test_bc.py index 89acd8102caa..072bba5cb712 100644 --- a/rllib/algorithms/bc/tests/test_bc.py +++ b/rllib/algorithms/bc/tests/test_bc.py @@ -48,7 +48,7 @@ def test_bc_compilation_and_learning_from_offline_file(self): # Test for RLModule API and ModelV2. for rl_modules in [True, False]: - config.experimental(_enable_new_api_stack=rl_modules) + config.api_stack(enable_rl_module_and_learner=rl_modules) # Old and new stack support different frameworks if rl_modules: frameworks_to_test = ("torch", "tf2") diff --git a/rllib/algorithms/dqn/dqn.py b/rllib/algorithms/dqn/dqn.py index e17b8aca2c98..7a869d950aa5 100644 --- a/rllib/algorithms/dqn/dqn.py +++ b/rllib/algorithms/dqn/dqn.py @@ -139,8 +139,8 @@ def __init__(self, algo_class=None): # `training()` self.grad_clip = 40.0 - # Note: Only when using _enable_new_api_stack=True can the clipping mode be - # configured by the user. On the old API stack, RLlib will always clip by + # Note: Only when using enable_rl_module_and_learner=True can the clipping mode + # be configured by the user. On the old API stack, RLlib will always clip by # global_norm, no matter the value of `grad_clip_by`. self.grad_clip_by = "global_norm" self.lr = 5e-4 @@ -407,7 +407,7 @@ def validate(self) -> None: super().validate() if ( - not self._enable_new_api_stack + not self.enable_rl_module_and_learner and self.exploration_config["type"] == "ParameterNoise" ): if self.batch_mode != "complete_episodes": @@ -417,7 +417,7 @@ def validate(self) -> None: "batch_mode='complete_episodes')`." 
) - if not self.uses_new_env_runners and not self.in_evaluation: + if not self.enable_env_runner_and_connector_v2 and not self.in_evaluation: validate_buffer_config(self) if self.td_error_loss_fn not in ["huber", "mse"]: @@ -439,7 +439,7 @@ def validate(self) -> None: # TODO (simon): Find a clean solution to deal with # configuration configs when using the new API stack. if ( - not self._enable_new_api_stack + not self.enable_rl_module_and_learner and self.exploration_config["type"] == "ParameterNoise" ): if self.batch_mode != "complete_episodes": @@ -462,7 +462,7 @@ def validate(self) -> None: ) if ( - self.uses_new_env_runners + self.enable_env_runner_and_connector_v2 and not isinstance(self.replay_buffer_config["type"], str) and not issubclass(self.replay_buffer_config["type"], EpisodeReplayBuffer) ): @@ -588,7 +588,7 @@ def training_step(self) -> ResultDict: The results dict from executing the training iteration. """ # New API stack (RLModule, Learner, EnvRunner, ConnectorV2). - if self.config.uses_new_env_runners: + if self.config.enable_env_runner_and_connector_v2: return self._training_step_new_api_stack(with_noise_reset=True) # Old and hybrid API stacks (Policy, RolloutWorker, Connector, maybe RLModule, # maybe Learner). diff --git a/rllib/algorithms/dreamerv3/dreamerv3.py b/rllib/algorithms/dreamerv3/dreamerv3.py index 437d35df82a1..1111813b645f 100644 --- a/rllib/algorithms/dreamerv3/dreamerv3.py +++ b/rllib/algorithms/dreamerv3/dreamerv3.py @@ -149,7 +149,8 @@ def __init__(self, algo_class=None): # with RLlib's `RemoteVectorEnv`). self.remote_worker_envs = True # Dreamer only runs on the new API stack. - self._enable_new_api_stack = True + self.enable_rl_module_and_learner = True + self.enable_env_runner_and_connector_v2 = True # __sphinx_doc_end__ # fmt: on @@ -382,10 +383,10 @@ def validate(self) -> None: raise ValueError("DreamerV3 does NOT support multi-agent setups yet!") # Make sure, we are configure for the new API stack. - if not self._enable_new_api_stack: + if not self.enable_rl_module_and_learner: raise ValueError( - "DreamerV3 must be run with `config.experimental(" - "_enable_new_api_stack=True)`!" + "DreamerV3 must be run with `config.api_stack(" + "enable_rl_module_and_learner=True)`!" ) # If run on several Learners, the provided batch_size_B must be a multiple diff --git a/rllib/algorithms/impala/impala.py b/rllib/algorithms/impala/impala.py index 118a590dc58c..9f2b22fc2aac 100644 --- a/rllib/algorithms/impala/impala.py +++ b/rllib/algorithms/impala/impala.py @@ -130,8 +130,8 @@ def __init__(self, algo_class=None): self.num_aggregation_workers = 0 self.grad_clip = 40.0 - # Note: Only when using _enable_new_api_stack=True can the clipping mode be - # configured by the user. On the old API stack, RLlib will always clip by + # Note: Only when using enable_rl_module_and_learner=True can the clipping mode + # be configured by the user. On the old API stack, RLlib will always clip by # global_norm, no matter the value of `grad_clip_by`. self.grad_clip_by = "global_norm" @@ -231,7 +231,7 @@ def training( each SGD iteration. If "auto", will use the same value as `train_batch_size`. Note that this setting only has an effect if - `_enable_new_api_stack=True` and it must be a multiple of + `enable_rl_module_and_learner=True` and it must be a multiple of `rollout_fragment_length` or `sequence_length` and smaller than or equal to `train_batch_size`. num_sgd_iter: Number of passes to make over each train batch. 
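Per the DQN `validate()` check above, the full new API stack expects an episode-based replay buffer (a string type bypasses the subclass check). A hedged sketch of a conforming config; the buffer class and its import path are assumptions, not taken from this patch:

from ray.rllib.algorithms.dqn import DQNConfig
from ray.rllib.utils.replay_buffers.episode_replay_buffer import EpisodeReplayBuffer

config = (
    DQNConfig()
    .api_stack(
        enable_rl_module_and_learner=True,
        enable_env_runner_and_connector_v2=True,
    )
    .environment("CartPole-v1")
    # Any `EpisodeReplayBuffer` subclass satisfies the issubclass() check above.
    .training(replay_buffer_config={"type": EpisodeReplayBuffer})
)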
@@ -366,13 +366,13 @@ def validate(self) -> None: # New stack w/ EnvRunners does NOT support aggregation workers yet or a mixin # replay buffer. - if self.uses_new_env_runners: + if self.enable_env_runner_and_connector_v2: if self.num_aggregation_workers > 0: raise ValueError( "Aggregation workers not supported on new API stack w/ new " "EnvRunner API! Set `config.num_aggregation_workers = 0` or " "disable the new API stack via " - "`config.experimental(_enable_new_api_stack=False)`." + "`config.api_stack(enable_rl_module_and_learner=False)`." ) if self.replay_ratio != 0.0: raise ValueError( @@ -387,7 +387,7 @@ def validate(self) -> None: ) # Entropy coeff schedule checking. - if self._enable_new_api_stack: + if self.enable_rl_module_and_learner: if self.entropy_coeff_schedule is not None: raise ValueError( "`entropy_coeff_schedule` is deprecated and must be None! Use the " @@ -428,7 +428,7 @@ def validate(self) -> None: ) # Learner API specific checks. if ( - self._enable_new_api_stack + self.enable_rl_module_and_learner and self._minibatch_size != "auto" and not ( (self.minibatch_size % self.rollout_fragment_length == 0) @@ -458,7 +458,7 @@ def minibatch_size(self): return ( ( self.train_batch_size_per_learner - if self.uses_new_env_runners + if self.enable_env_runner_and_connector_v2 else self.train_batch_size ) if self._minibatch_size == "auto" @@ -554,9 +554,9 @@ class Impala(Algorithm): == Overview of data flow in IMPALA == 1. Policy evaluation in parallel across `num_workers` actors produces - batches of size `rollout_fragment_length * num_envs_per_worker`. + batches of size `rollout_fragment_length * num_envs_per_env_runner`. 2. If enabled, the replay buffer stores and produces batches of size - `rollout_fragment_length * num_envs_per_worker`. + `rollout_fragment_length * num_envs_per_env_runner`. 3. If enabled, the minibatch ring buffer stores and replays batches of size `train_batch_size` up to `num_sgd_iter` times per batch. 4. The learner thread executes data parallel SGD across `num_gpus` GPUs @@ -656,7 +656,7 @@ def setup(self, config: AlgorithmConfig): # update of the learner group self._results = {} - if not self.config._enable_new_api_stack: + if not self.config.enable_rl_module_and_learner: # Create and start the learner thread. self._learner_thread = make_learner_thread( self.workers.local_worker(), self.config @@ -667,7 +667,7 @@ def setup(self, config: AlgorithmConfig): def training_step(self) -> ResultDict: # First, check, whether our learner thread is still healthy. if ( - not self.config._enable_new_api_stack + not self.config.enable_rl_module_and_learner and not self._learner_thread.is_alive() ): raise RuntimeError("The learner thread died while training!") @@ -706,7 +706,7 @@ def training_step(self) -> ResultDict: self.concatenate_batches_and_pre_queue(batches) # Using the Learner API. Call `update()` on our LearnerGroup object with # all collected batches. - if self.config._enable_new_api_stack: + if self.config.enable_rl_module_and_learner: train_results = self.learn_on_processed_samples() module_ids_to_update = set(train_results.keys()) - {ALL_MODULES} additional_results = self.learner_group.additional_update( @@ -734,7 +734,7 @@ def training_step(self) -> ResultDict: # Sync worker weights (only those policies that were actually updated). 
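The `minibatch_size` property above can be read as a small fallback chain; the helper below is a hypothetical paraphrase (not in the patch), assuming the non-"auto" branch simply returns the explicitly configured value, as the surrounding docstring implies.

def resolve_minibatch_size(cfg):
    # "auto" falls back to the (per-Learner) train batch size: with the new
    # EnvRunner/ConnectorV2 stack that is `train_batch_size_per_learner`,
    # otherwise the classic `train_batch_size`.
    if cfg._minibatch_size == "auto":
        return (
            cfg.train_batch_size_per_learner
            if cfg.enable_env_runner_and_connector_v2
            else cfg.train_batch_size
        )
    # An explicitly configured minibatch size is used as-is (it still has to
    # divide evenly into the rollout fragment or sequence length, see above).
    return cfg._minibatch_size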
with self._timers[SYNCH_WORKER_WEIGHTS_TIMER]: - if self.config._enable_new_api_stack: + if self.config.enable_rl_module_and_learner: if train_results: pids = list(set(train_results.keys()) - {ALL_MODULES}) self.update_workers_from_learner_group( @@ -758,7 +758,7 @@ def training_step(self) -> ResultDict: mark_healthy=True, ) - if self.config._enable_new_api_stack: + if self.config.enable_rl_module_and_learner: if train_results: # Store the most recent result and return it if no new result is # available. This keeps backwards compatibility with the old @@ -821,9 +821,9 @@ def default_resource_request( else [] ) ) - # TODO(avnishn): Remove this once we have a way to extend placement group - # factories. - if cf._enable_new_api_stack: + # TODO (avnishn): Remove this once we have a way to extend placement group + # factories. + if cf.enable_rl_module_and_learner: # Resources for the Algorithm. learner_bundles = cls._get_learner_bundles(cf) @@ -981,8 +981,8 @@ def learn_on_processed_samples(self) -> ResultDict: def place_processed_samples_on_learner_thread_queue(self) -> None: """Place processed samples on the learner queue for training. - NOTE: This method is called if self.config._enable_new_api_stack is False. - + NOTE: This method is called if self.config.enable_rl_module_and_learner is + False. """ for i, batch in enumerate(self.batches_to_place_on_learner): try: @@ -1008,7 +1008,8 @@ def place_processed_samples_on_learner_thread_queue(self) -> None: def process_trained_results(self) -> ResultDict: """Process training results that are outputed by the learner thread. - NOTE: This method is called if self.config._enable_new_api_stack is False. + NOTE: This method is called if self.config.enable_rl_module_and_learner is + False. Returns: Aggregated results from the learner thread after an update is completed. @@ -1236,7 +1237,7 @@ def _compile_iteration_results_old_and_hybrid_api_stacks(self, *args, **kwargs): result = super()._compile_iteration_results_old_and_hybrid_api_stacks( *args, **kwargs ) - if not self.config._enable_new_api_stack: + if not self.config.enable_rl_module_and_learner: result = self._learner_thread.add_learner_metrics( result, overwrite_learner_info=False ) diff --git a/rllib/algorithms/impala/impala_tf_policy.py b/rllib/algorithms/impala/impala_tf_policy.py index 555a15cbb8b4..6b8b592454f9 100644 --- a/rllib/algorithms/impala/impala_tf_policy.py +++ b/rllib/algorithms/impala/impala_tf_policy.py @@ -178,7 +178,7 @@ def compute_gradients_fn( self, optimizer: LocalOptimizer, loss: TensorType ) -> ModelGradients: # Supporting more than one loss/optimizer. - if self.config.get("_enable_new_api_stack", False): + if self.config.get("enable_rl_module_and_learner", False): # In order to access the variables for rl modules, we need to # use the underlying keras api model.trainable_variables. trainable_variables = self.model.trainable_variables @@ -302,7 +302,7 @@ def __init__( # However, we also would like to avoid creating special Policy-subclasses # for this as the entire Policy concept will soon not be used anymore with # the new Learner- and RLModule APIs. 
- if not self.config.get("_enable_new_api_stack"): + if not self.config.get("enable_rl_module_and_learner"): GradStatsMixin.__init__(self) VTraceClipGradients.__init__(self) VTraceOptimizer.__init__(self) diff --git a/rllib/algorithms/impala/impala_torch_policy.py b/rllib/algorithms/impala/impala_torch_policy.py index c6f9a19eeb57..547c2405acb0 100644 --- a/rllib/algorithms/impala/impala_torch_policy.py +++ b/rllib/algorithms/impala/impala_torch_policy.py @@ -242,7 +242,7 @@ def __init__(self, observation_space, action_space, config): # However, we also would like to avoid creating special Policy-subclasses # for this as the entire Policy concept will soon not be used anymore with # the new Learner- and RLModule APIs. - if not config.get("_enable_new_api_stack"): + if not config.get("enable_rl_module_and_learner"): VTraceOptimizer.__init__(self) # Need to initialize learning rate variable before calling # TorchPolicyV2.__init__. diff --git a/rllib/algorithms/impala/tests/test_impala_learner.py b/rllib/algorithms/impala/tests/test_impala_learner.py index b4b90cb8305a..2e55e21eabae 100644 --- a/rllib/algorithms/impala/tests/test_impala_learner.py +++ b/rllib/algorithms/impala/tests/test_impala_learner.py @@ -58,7 +58,7 @@ def test_impala_loss(self): """ config = ( ImpalaConfig() - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .environment("CartPole-v1") .env_runners( num_env_runners=0, diff --git a/rllib/algorithms/impala/tests/test_impala_off_policyness.py b/rllib/algorithms/impala/tests/test_impala_off_policyness.py index 0cf8ec62875c..d9f3d7ecf621 100644 --- a/rllib/algorithms/impala/tests/test_impala_off_policyness.py +++ b/rllib/algorithms/impala/tests/test_impala_off_policyness.py @@ -23,7 +23,7 @@ def tearDownClass(cls) -> None: def test_impala_off_policyness(self): config = ( impala.ImpalaConfig() - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .environment("CartPole-v1") .resources(num_gpus=0) .env_runners(num_env_runners=4) diff --git a/rllib/algorithms/impala/tf/impala_tf_learner.py b/rllib/algorithms/impala/tf/impala_tf_learner.py index 1f93aff30767..9d0c084a25b8 100644 --- a/rllib/algorithms/impala/tf/impala_tf_learner.py +++ b/rllib/algorithms/impala/tf/impala_tf_learner.py @@ -60,7 +60,7 @@ def compute_loss_for_module( trajectory_len=rollout_frag_or_episode_len, recurrent_seq_len=recurrent_seq_len, ) - if self.config.uses_new_env_runners: + if self.config.enable_env_runner_and_connector_v2: bootstrap_values = batch[Columns.VALUES_BOOTSTRAPPED] else: bootstrap_values_time_major = make_time_major( diff --git a/rllib/algorithms/impala/torch/impala_torch_learner.py b/rllib/algorithms/impala/torch/impala_torch_learner.py index c93a5c885c6c..ff061ab257b7 100644 --- a/rllib/algorithms/impala/torch/impala_torch_learner.py +++ b/rllib/algorithms/impala/torch/impala_torch_learner.py @@ -68,7 +68,7 @@ def compute_loss_for_module( trajectory_len=rollout_frag_or_episode_len, recurrent_seq_len=recurrent_seq_len, ) - if self.config.uses_new_env_runners: + if self.config.enable_env_runner_and_connector_v2: bootstrap_values = batch[Columns.VALUES_BOOTSTRAPPED] else: bootstrap_values_time_major = make_time_major( diff --git a/rllib/algorithms/ppo/ppo.py b/rllib/algorithms/ppo/ppo.py index fcbffe42acbc..d30909dcfce1 100644 --- a/rllib/algorithms/ppo/ppo.py +++ b/rllib/algorithms/ppo/ppo.py @@ -332,7 +332,7 @@ def validate(self) -> None: # we subsample a batch of `sgd_minibatch_size` from the train-batch for # 
each `num_sgd_iter`). if ( - not self._enable_new_api_stack + not self.enable_rl_module_and_learner and self.sgd_minibatch_size > self.train_batch_size ): raise ValueError( @@ -342,7 +342,7 @@ def validate(self) -> None: f"is iterated over (used for updating the policy) {self.num_sgd_iter} " "times." ) - elif self._enable_new_api_stack: + elif self.enable_rl_module_and_learner: mbs = self.mini_batch_size_per_learner or self.sgd_minibatch_size tbs = self.train_batch_size_per_learner or self.train_batch_size if isinstance(mbs, int) and isinstance(tbs, int) and mbs > tbs: @@ -370,7 +370,7 @@ def validate(self) -> None: ) # Entropy coeff schedule checking. - if self._enable_new_api_stack: + if self.enable_rl_module_and_learner: if self.entropy_coeff_schedule is not None: raise ValueError( "`entropy_coeff_schedule` is deprecated and must be None! Use the " @@ -418,7 +418,7 @@ def get_default_policy_class( @override(Algorithm) def training_step(self): # New API stack (RLModule, Learner, EnvRunner, ConnectorV2). - if self.config.uses_new_env_runners: + if self.config.enable_env_runner_and_connector_v2: return self._training_step_new_api_stack() # Old and hybrid API stacks (Policy, RolloutWorker, Connector, maybe RLModule, # maybe Learner). @@ -434,7 +434,9 @@ def _training_step_new_api_stack(self) -> ResultDict: worker_set=self.workers, max_agent_steps=self.config.total_train_batch_size, sample_timeout_s=self.config.sample_timeout_s, - _uses_new_env_runners=self.config.uses_new_env_runners, + _uses_new_env_runners=( + self.config.enable_env_runner_and_connector_v2 + ), _return_metrics=True, ) else: @@ -442,7 +444,9 @@ def _training_step_new_api_stack(self) -> ResultDict: worker_set=self.workers, max_env_steps=self.config.total_train_batch_size, sample_timeout_s=self.config.sample_timeout_s, - _uses_new_env_runners=self.config.uses_new_env_runners, + _uses_new_env_runners=( + self.config.enable_env_runner_and_connector_v2 + ), _return_metrics=True, ) # Return early if all our workers failed. @@ -584,7 +588,7 @@ def _training_step_old_and_hybrid_api_stacks(self) -> ResultDict: train_batch = standardize_fields(train_batch, ["advantages"]) # Perform a train step on the collected batch. - if self.config._enable_new_api_stack: + if self.config.enable_rl_module_and_learner: mini_batch_size_per_learner = ( self.config.mini_batch_size_per_learner or self.config.sgd_minibatch_size @@ -600,7 +604,7 @@ def _training_step_old_and_hybrid_api_stacks(self) -> ResultDict: else: train_results = multi_gpu_train_one_step(self, train_batch) - if self.config._enable_new_api_stack: + if self.config.enable_rl_module_and_learner: # The train results's loss keys are pids to their loss values. But we also # return a total_loss key at the same level as the pid keys. So we need to # subtract that to get the total set of pids to update. 
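As a concrete reading of the PPO minibatch check above (sketch, not from the patch; only `PPOConfig`, `.api_stack()`, `.training()` and `validate()` from these hunks are used, with illustrative sizes): the effective minibatch, whether set per Learner or via `sgd_minibatch_size`, must not exceed the effective train batch size.

from ray.rllib.algorithms.ppo import PPOConfig

config = (
    PPOConfig()
    .api_stack(enable_rl_module_and_learner=True)
    .environment("CartPole-v1")
    # sgd_minibatch_size must be <= train_batch_size, otherwise validate()
    # raises the ValueError shown above (the same rule applies to the
    # *_per_learner variants).
    .training(train_batch_size=4000, sgd_minibatch_size=128, num_sgd_iter=10)
)
config.validate()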
@@ -626,7 +630,7 @@ def _training_step_old_and_hybrid_api_stacks(self) -> ResultDict: with self._timers[SYNCH_WORKER_WEIGHTS_TIMER]: if self.workers.num_remote_workers() > 0: from_worker_or_learner_group = None - if self.config._enable_new_api_stack: + if self.config.enable_rl_module_and_learner: # sync weights from learner_group to all rollout workers from_worker_or_learner_group = self.learner_group self.workers.sync_weights( @@ -634,11 +638,11 @@ def _training_step_old_and_hybrid_api_stacks(self) -> ResultDict: policies=policies_to_update, global_vars=global_vars, ) - elif self.config._enable_new_api_stack: + elif self.config.enable_rl_module_and_learner: weights = self.learner_group.get_weights() self.workers.local_worker().set_weights(weights) - if self.config._enable_new_api_stack: + if self.config.enable_rl_module_and_learner: kl_dict = {} if self.config.use_kl_loss: for pid in policies_to_update: diff --git a/rllib/algorithms/ppo/ppo_learner.py b/rllib/algorithms/ppo/ppo_learner.py index d31c8e47552e..9b6c6f3d1876 100644 --- a/rllib/algorithms/ppo/ppo_learner.py +++ b/rllib/algorithms/ppo/ppo_learner.py @@ -60,7 +60,7 @@ def _update_from_batch_or_episodes( ): # First perform GAE computation on the entirety of the given train data (all # episodes). - if self.config.uses_new_env_runners: + if self.config.enable_env_runner_and_connector_v2: batch, episodes = self._compute_gae_from_episodes(episodes=episodes) # Now that GAE (advantages and value targets) have been added to the train # batch, we can proceed normally (calling super method) with the update step. diff --git a/rllib/algorithms/ppo/tests/test_ppo.py b/rllib/algorithms/ppo/tests/test_ppo.py index 8541995302e0..57988d3df6e8 100644 --- a/rllib/algorithms/ppo/tests/test_ppo.py +++ b/rllib/algorithms/ppo/tests/test_ppo.py @@ -128,9 +128,6 @@ def test_ppo_compilation_w_connectors(self): # Build a PPOConfig object. config = ( ppo.PPOConfig() - # TODO (Kourosh): Enable when the scheduler is supported in the new - # Learner API stack. - .experimental(_enable_new_api_stack=False) .training( num_sgd_iter=2, # Setup lr schedule for testing. @@ -202,9 +199,6 @@ def test_ppo_compilation_and_schedule_mixins(self): # Build a PPOConfig object. config = ( ppo.PPOConfig() - # TODO (Kourosh): Enable when the scheduler is supported in the new - # Learner API stack. - .experimental(_enable_new_api_stack=False) .training( # Setup lr schedule for testing. lr_schedule=[[0, 5e-5], [256, 0.0]], @@ -275,11 +269,11 @@ def test_ppo_exploration_setup(self): """Tests, whether PPO runs with different exploration setups.""" config = ( ppo.PPOConfig() - # .experimental(_enable_new_api_stack=True) .environment( "FrozenLake-v1", env_config={"is_slippery": False, "map_name": "4x4"}, - ).env_runners( + ) + .env_runners( # Run locally. num_env_runners=0, ) @@ -325,9 +319,6 @@ def test_ppo_free_log_std(self): config = ( ppo.PPOConfig() - # TODO (Kourosh): Enable when free log std is supported in the new - # Learner API stack. 
- .experimental(_enable_new_api_stack=False) .environment("CartPole-v1") .env_runners( num_env_runners=0, @@ -392,7 +383,6 @@ def test_ppo_loss_function(self): """ config = ( ppo.PPOConfig() - .experimental(_enable_new_api_stack=False) .environment("CartPole-v1") .env_runners( num_env_runners=0, diff --git a/rllib/algorithms/ppo/tests/test_ppo_learner.py b/rllib/algorithms/ppo/tests/test_ppo_learner.py index 2d897caca974..cc48fc6b1ca9 100644 --- a/rllib/algorithms/ppo/tests/test_ppo_learner.py +++ b/rllib/algorithms/ppo/tests/test_ppo_learner.py @@ -56,7 +56,7 @@ def tearDownClass(cls): def test_loss(self): config = ( ppo.PPOConfig() - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .environment("CartPole-v1") .env_runners( num_env_runners=0, @@ -104,7 +104,7 @@ def test_save_load_state(self): """Tests saving and loading the state of the PPO Learner Group.""" config = ( ppo.PPOConfig() - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .environment("CartPole-v1") .env_runners( num_env_runners=0, @@ -142,7 +142,7 @@ def test_kl_coeff_changes(self): initial_kl_coeff = 0.01 config = ( ppo.PPOConfig() - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .environment("CartPole-v1") .env_runners( num_env_runners=0, diff --git a/rllib/algorithms/ppo/tests/test_ppo_with_env_runner.py b/rllib/algorithms/ppo/tests/test_ppo_with_env_runner.py index b98a60551ee2..09ff35ac9eaf 100644 --- a/rllib/algorithms/ppo/tests/test_ppo_with_env_runner.py +++ b/rllib/algorithms/ppo/tests/test_ppo_with_env_runner.py @@ -11,7 +11,6 @@ LEARNER_RESULTS_CURR_LR_KEY, ) -from ray.rllib.env.single_agent_env_runner import SingleAgentEnvRunner from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID from ray.rllib.utils.metrics import LEARNER_RESULTS from ray.rllib.utils.test_utils import ( @@ -75,11 +74,11 @@ def test_ppo_compilation_and_schedule_mixins(self): config = ( ppo.PPOConfig() # Enable new API stack and use EnvRunner. - .experimental(_enable_new_api_stack=True) - .env_runners( - env_runner_cls=SingleAgentEnvRunner, - num_rollout_workers=0, + .api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, ) + .env_runners(num_env_runners=0) .training( num_sgd_iter=2, # Setup lr schedule for testing lr-scheduling correctness. diff --git a/rllib/algorithms/ppo/tests/test_ppo_with_rl_module.py b/rllib/algorithms/ppo/tests/test_ppo_with_rl_module.py index 724f64374ed2..6e9810093c96 100644 --- a/rllib/algorithms/ppo/tests/test_ppo_with_rl_module.py +++ b/rllib/algorithms/ppo/tests/test_ppo_with_rl_module.py @@ -78,7 +78,7 @@ def test_ppo_compilation_and_schedule_mixins(self): # Build a PPOConfig object. config = ( ppo.PPOConfig() - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .training( num_sgd_iter=2, # Setup lr schedule for testing lr-scheduling correctness. 
@@ -137,7 +137,7 @@ def test_ppo_exploration_setup(self): """Tests, whether PPO runs with different exploration setups.""" config = ( ppo.PPOConfig() - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .environment( "FrozenLake-v1", env_config={"is_slippery": False, "map_name": "4x4"}, @@ -181,7 +181,7 @@ def test_ppo_free_log_std_with_rl_modules(self): """Tests the free log std option works.""" config = ( ppo.PPOConfig() - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .environment("Pendulum-v1") .env_runners( num_env_runners=1, diff --git a/rllib/algorithms/sac/sac.py b/rllib/algorithms/sac/sac.py index 29527bf2d115..c58170cc44e7 100644 --- a/rllib/algorithms/sac/sac.py +++ b/rllib/algorithms/sac/sac.py @@ -217,7 +217,7 @@ def training( collecting samples from the env). If None, uses "natural" values of: `train_batch_size` / (`rollout_fragment_length` x `num_workers` x - `num_envs_per_worker`). + `num_envs_per_env_runner`). If not None, will make sure that the ratio between timesteps inserted into and sampled from th buffer matches the given values. Example: @@ -225,7 +225,7 @@ def training( train_batch_size=250 rollout_fragment_length=1 num_workers=1 (or 0) - num_envs_per_worker=1 + num_envs_per_env_runner=1 -> natural value = 250 / 1 = 250.0 -> will make sure that replay+train op will be executed 4x asoften as rollout+insert op (4 * 250 = 1000). @@ -347,7 +347,9 @@ def validate(self) -> None: # Validate that we use the corresponding `EpisodeReplayBuffer` when using # episodes. # TODO (sven, simon): Implement the multi-agent case for replay buffers. - if self.uses_new_env_runners and self.replay_buffer_config["type"] not in [ + if self.enable_env_runner_and_connector_v2 and self.replay_buffer_config[ + "type" + ] not in [ "EpisodeReplayBuffer", "PrioritizedEpisodeReplayBuffer", ]: @@ -445,7 +447,7 @@ def training_step(self) -> ResultDict: The results dict from executing the training iteration. """ # New API stack (RLModule, Learner, EnvRunner, ConnectorV2). - if self.config.uses_new_env_runners: + if self.config.enable_env_runner_and_connector_v2: return self._training_step_new_api_stack(with_noise_reset=False) # Old and hybrid API stacks (Policy, RolloutWorker, Connector, maybe RLModule, # maybe Learner). diff --git a/rllib/algorithms/tests/test_algorithm_config.py b/rllib/algorithms/tests/test_algorithm_config.py index fc4d9e62f7d7..aed916921967 100644 --- a/rllib/algorithms/tests/test_algorithm_config.py +++ b/rllib/algorithms/tests/test_algorithm_config.py @@ -171,7 +171,7 @@ def test_detect_atari_env(self): def test_rl_module_api(self): config = ( PPOConfig() - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .environment("CartPole-v1") .framework("torch") .env_runners(enable_connectors=True) @@ -231,7 +231,7 @@ def test_config_per_module(self): def test_learner_api(self): config = ( PPOConfig() - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .environment("CartPole-v1") .env_runners(enable_connectors=True) .framework("tf2") @@ -360,7 +360,7 @@ def get_default_rl_module_spec(self): ######################################## # This is the simplest case where we have to construct the marl module based on # the default specs only. 
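Tying the SAC `validate()` change further above to a concrete configuration (sketch, not from the patch; the buffer type string and environment name are taken from other hunks in this diff): with both new-stack flags on, the replay buffer has to be episode-based.

from ray.rllib.algorithms.sac import SACConfig

config = (
    SACConfig()
    .api_stack(
        enable_rl_module_and_learner=True,
        enable_env_runner_and_connector_v2=True,
    )
    .environment("Pendulum-v1")
    # The new-stack check above only accepts "EpisodeReplayBuffer" or
    # "PrioritizedEpisodeReplayBuffer" here; anything else raises a ValueError.
    .training(replay_buffer_config={"type": "EpisodeReplayBuffer"})
)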
- config = SingleAgentAlgoConfig().experimental(_enable_new_api_stack=True) + config = SingleAgentAlgoConfig().api_stack(enable_rl_module_and_learner=True) spec, expected = self._get_expected_marl_spec(config, DiscreteBCTorchModule) self._assertEqualMARLSpecs(spec, expected) @@ -376,7 +376,7 @@ def get_default_rl_module_spec(self): # algorithm to assign a specific type of RLModule class to certain module_ids. config = ( SingleAgentAlgoConfig() - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .rl_module( rl_module_spec=MultiAgentRLModuleSpec( module_specs={ @@ -395,7 +395,7 @@ def get_default_rl_module_spec(self): # RLModule class to ALL module_ids. config = ( SingleAgentAlgoConfig() - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .rl_module( rl_module_spec=SingleAgentRLModuleSpec(module_class=CustomRLModule1), ) @@ -414,7 +414,7 @@ def get_default_rl_module_spec(self): # RLModule class to ALL module_ids. config = ( SingleAgentAlgoConfig() - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .rl_module( rl_module_spec=MultiAgentRLModuleSpec( module_specs=SingleAgentRLModuleSpec(module_class=CustomRLModule1) @@ -437,7 +437,7 @@ def get_default_rl_module_spec(self): # in the multi-agent scenario. config = ( SingleAgentAlgoConfig() - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .rl_module( rl_module_spec=MultiAgentRLModuleSpec( marl_module_class=CustomMARLModule1, @@ -474,8 +474,8 @@ def get_default_rl_module_spec(self): # This is the case where we ask the algorithm to use its default # MultiAgentRLModuleSpec, but the MultiAgentRLModuleSpec has not defined its # SingleAgentRLmoduleSpecs. - config = MultiAgentAlgoConfigWithNoSingleAgentSpec().experimental( - _enable_new_api_stack=True + config = MultiAgentAlgoConfigWithNoSingleAgentSpec().api_stack( + enable_rl_module_and_learner=True ) self.assertRaisesRegex( @@ -488,7 +488,7 @@ def get_default_rl_module_spec(self): # This is the case where we ask the algorithm to use its default # MultiAgentRLModuleSpec, and the MultiAgentRLModuleSpec has defined its # SingleAgentRLmoduleSpecs. 
- config = MultiAgentAlgoConfig().experimental(_enable_new_api_stack=True) + config = MultiAgentAlgoConfig().api_stack(enable_rl_module_and_learner=True) spec, expected = self._get_expected_marl_spec( config, DiscreteBCTorchModule, expected_marl_module_class=CustomMARLModule1 diff --git a/rllib/algorithms/tests/test_algorithm_export_checkpoint.py b/rllib/algorithms/tests/test_algorithm_export_checkpoint.py index 8a3579cb5339..d5ddec5c79f7 100644 --- a/rllib/algorithms/tests/test_algorithm_export_checkpoint.py +++ b/rllib/algorithms/tests/test_algorithm_export_checkpoint.py @@ -29,7 +29,7 @@ def save_test(alg_name, framework="tf", multi_agent=False): ) if alg_name in RLMODULE_SUPPORTED_ALGOS: - config = config.experimental(_enable_new_api_stack=False) + config = config.api_stack(enable_rl_module_and_learner=False) if "DDPG" in alg_name or "SAC" in alg_name: config.environment("Pendulum-v1") diff --git a/rllib/algorithms/tests/test_callbacks_old_stack.py b/rllib/algorithms/tests/test_callbacks_old_stack.py index f01b86151711..11fea5c94a7a 100644 --- a/rllib/algorithms/tests/test_callbacks_old_stack.py +++ b/rllib/algorithms/tests/test_callbacks_old_stack.py @@ -78,7 +78,7 @@ def test_episode_and_sample_callbacks(self): config = ( PPOConfig() .environment("CartPole-v1") - .env_runners(num_rollout_workers=0) + .env_runners(num_env_runners=0) .callbacks(EpisodeAndSampleCallbacks) .training(train_batch_size=50, sgd_minibatch_size=50, num_sgd_iter=1) ) @@ -99,7 +99,7 @@ def test_on_sub_environment_created(self): dqn.DQNConfig().environment("CartPole-v1") # Create 4 sub-environments per remote worker. # Create 2 remote workers. - .env_runners(num_envs_per_worker=4, num_rollout_workers=2) + .env_runners(num_envs_per_env_runner=4, num_env_runners=2) ) for callbacks in ( @@ -135,10 +135,10 @@ def test_on_sub_environment_created_with_remote_envs(self): # Make each sub-environment a ray actor. remote_worker_envs=True, # Create 2 remote workers. - num_rollout_workers=2, + num_env_runners=2, # Create 4 sub-environments (ray remote actors) per remote # worker. 
- num_envs_per_worker=4, + num_envs_per_env_runner=4, ) ) @@ -179,7 +179,7 @@ def test_on_episode_created(self): "p_terminated": 0.0, }, ) - .env_runners(num_envs_per_worker=2, num_rollout_workers=1) + .env_runners(num_envs_per_env_runner=2, num_env_runners=1) .callbacks(OnEpisodeCreatedCallback) ) diff --git a/rllib/algorithms/tests/test_callbacks_on_algorithm.py b/rllib/algorithms/tests/test_callbacks_on_algorithm.py index d9f6b4cdc718..c3533ab6ac8b 100644 --- a/rllib/algorithms/tests/test_callbacks_on_algorithm.py +++ b/rllib/algorithms/tests/test_callbacks_on_algorithm.py @@ -58,7 +58,7 @@ def test_on_workers_recreated_callback(self): APPOConfig() .environment("env") .callbacks(OnWorkersRecreatedCallbacks) - .env_runners(num_rollout_workers=3) + .env_runners(num_env_runners=3) .fault_tolerance( recreate_failed_env_runners=True, delay_between_env_runner_restarts_s=0, diff --git a/rllib/algorithms/tests/test_callbacks_on_env_runner.py b/rllib/algorithms/tests/test_callbacks_on_env_runner.py index 02b007eab2aa..062f39a99f01 100644 --- a/rllib/algorithms/tests/test_callbacks_on_env_runner.py +++ b/rllib/algorithms/tests/test_callbacks_on_env_runner.py @@ -4,7 +4,6 @@ import ray from ray.rllib.algorithms.callbacks import DefaultCallbacks from ray.rllib.algorithms.ppo import PPOConfig -from ray.rllib.env.single_agent_env_runner import SingleAgentEnvRunner from ray.rllib.utils.test_utils import framework_iterator @@ -72,12 +71,14 @@ def tearDownClass(cls): def test_episode_and_sample_callbacks_batch_mode_truncate_episodes(self): config = ( PPOConfig() - .experimental(_enable_new_api_stack=True) + .api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, + ) .environment("CartPole-v1") .env_runners( - num_rollout_workers=0, + num_env_runners=0, batch_mode="truncate_episodes", - env_runner_cls=SingleAgentEnvRunner, ) .callbacks(EpisodeAndSampleCallbacks) .training( @@ -115,12 +116,14 @@ def test_episode_and_sample_callbacks_batch_mode_truncate_episodes(self): def test_episode_and_sample_callbacks_batch_mode_complete_episodes(self): config = ( PPOConfig() - .experimental(_enable_new_api_stack=True) + .api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, + ) .environment("CartPole-v1") .env_runners( batch_mode="complete_episodes", - env_runner_cls=SingleAgentEnvRunner, - num_rollout_workers=0, + num_env_runners=0, ) .callbacks(EpisodeAndSampleCallbacks) .training( @@ -158,8 +161,10 @@ def test_overriding_on_episode_created_throws_error_on_new_api_stack(self): """Tests, whw""" config = ( PPOConfig() - .experimental(_enable_new_api_stack=True) - .env_runners(env_runner_cls=SingleAgentEnvRunner) + .api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, + ) .callbacks(OnEpisodeCreatedCallback) ) self.assertRaises(ValueError, lambda: config.validate()) diff --git a/rllib/algorithms/tests/test_worker_failures.py b/rllib/algorithms/tests/test_worker_failures.py index 100643ae6ee8..96e951119d69 100644 --- a/rllib/algorithms/tests/test_worker_failures.py +++ b/rllib/algorithms/tests/test_worker_failures.py @@ -389,9 +389,11 @@ def test_fatal_single_agent(self): # Test the case where all workers fail (w/o recovery). 
self._do_test_failing_fatal( PPOConfig() - .experimental(_enable_new_api_stack=True) + .api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, + ) .env_runners( - env_runner_cls=SingleAgentEnvRunner, env_to_module_connector=lambda env: FlattenObservations(), ) ) @@ -400,8 +402,10 @@ def test_fatal_multi_agent(self): # Test the case where all workers fail (w/o recovery). self._do_test_failing_fatal( PPOConfig() - .experimental(_enable_new_api_stack=True) - .env_runners(env_runner_cls=MultiAgentEnvRunner) + .api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, + ) .multi_agent(policies={"p0"}, policy_mapping_fn=lambda *a, **k: "p0"), ) @@ -409,7 +413,10 @@ def test_fatal_multi_agent(self): # def test_async_samples(self): # self._do_test_fault_ignore( # ImpalaConfig() - # .experimental(_enable_new_api_stack=True) + # .api_stack( + # enable_rl_module_and_learner=True, + # enable_env_runner_and_connector_v2=True, + # ) # .env_runners(env_runner_cls=ForwardHealthCheckToEnvWorker) # .resources(num_gpus=0) # ) @@ -417,7 +424,10 @@ def test_fatal_multi_agent(self): def test_sync_replay(self): self._do_test_failing_ignore( SACConfig() - .experimental(_enable_new_api_stack=True) + .api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, + ) .environment( env_config={"action_space": gym.spaces.Box(0, 1, (2,), np.float32)} ) @@ -429,10 +439,11 @@ def test_multi_gpu(self): self._do_test_failing_ignore( PPOConfig() - .experimental(_enable_new_api_stack=True) - .env_runners( - env_runner_cls=ForwardHealthCheckToEnvWorker, + .api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, ) + .env_runners(env_runner_cls=ForwardHealthCheckToEnvWorker) .training( train_batch_size=10, sgd_minibatch_size=1, @@ -443,7 +454,10 @@ def test_sync_samples(self): self._do_test_failing_ignore( PPOConfig() - .experimental(_enable_new_api_stack=True) + .api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, + ) .env_runners(env_runner_cls=ForwardHealthCheckToEnvWorker) .training(optimizer={}) ) @@ -452,7 +466,10 @@ def test_eval_workers_failing_ignore(self): # Test the case where one eval worker fails, but we chose to ignore. self._do_test_failing_ignore( PPOConfig() - .experimental(_enable_new_api_stack=True) + .api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, + ) .env_runners(env_runner_cls=ForwardHealthCheckToEnvWorker) .training(model={"fcnet_hiddens": [4]}), fail_eval=True, @@ -462,7 +479,10 @@ def test_eval_workers_parallel_to_training_failing_recover(self): # Test the case where all eval workers fail, but we chose to recover. config = ( PPOConfig() - .experimental(_enable_new_api_stack=True) + .api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, + ) .env_runners(env_runner_cls=ForwardHealthCheckToEnvWorker) .evaluation( evaluation_num_env_runners=1, @@ -482,7 +502,10 @@ def test_eval_workers_parallel_to_training_multi_agent_failing_recover( # to recover.
config = ( PPOConfig() - .experimental(_enable_new_api_stack=True) + .api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, + ) .env_runners(env_runner_cls=ForwardHealthCheckToEnvWorkerMultiAgent) .multi_agent( policies={"main", "p0", "p1"}, @@ -518,7 +541,10 @@ def test_workers_failing_recover(self): config = ( PPOConfig() - .experimental(_enable_new_api_stack=True) + .api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, + ) .env_runners( env_runner_cls=ForwardHealthCheckToEnvWorker, num_env_runners=2, @@ -574,7 +600,10 @@ def test_modules_are_restored_on_recovered_worker(self): config = ( PPOConfig() - .experimental(_enable_new_api_stack=True) + .api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, + ) .env_runners( env_runner_cls=ForwardHealthCheckToEnvWorkerMultiAgent, num_env_runners=2, @@ -678,7 +707,10 @@ def test_eval_workers_failing_recover(self): config = ( PPOConfig() - .experimental(_enable_new_api_stack=True) + .api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, + ) .env_runners( env_runner_cls=ForwardHealthCheckToEnvWorker, num_env_runners=2, @@ -746,7 +778,10 @@ def test_worker_failing_recover_with_hanging_workers(self): # the execution of the algorithm b/c of a single heavily stalling worker. # Timeout data (batches or episodes) are discarded. SACConfig() - .experimental(_enable_new_api_stack=True) + .api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, + ) .training( replay_buffer_config={"type": "EpisodeReplayBuffer"}, ) diff --git a/rllib/connectors/agent/state_buffer.py b/rllib/connectors/agent/state_buffer.py index 91e22990560a..bb235db2ab8d 100644 --- a/rllib/connectors/agent/state_buffer.py +++ b/rllib/connectors/agent/state_buffer.py @@ -33,7 +33,9 @@ def __init__(self, ctx: ConnectorContext, states: Any = None): self._action_space_struct = get_base_struct_from_space(ctx.action_space) self._states = defaultdict(lambda: defaultdict(lambda: (None, None, None))) - self._enable_new_api_stack = ctx.config.get("_enable_new_api_stack", False) + self._enable_new_api_stack = ctx.config.get( + "enable_rl_module_and_learner", False + ) # TODO(jungong) : we would not need this if policies are never stashed # during the rollout of a single episode. 
if states: @@ -65,7 +67,7 @@ def on_policy_output(self, ac_data: ActionConnectorDataType): def transform(self, ac_data: AgentConnectorDataType) -> AgentConnectorDataType: d = ac_data.data assert ( - type(d) == dict + type(d) is dict ), "Single agent data must be of type Dict[str, TensorStructType]" env_id = ac_data.env_id diff --git a/rllib/connectors/agent/view_requirement.py b/rllib/connectors/agent/view_requirement.py index 1a079792bd32..7bfe7270102c 100644 --- a/rllib/connectors/agent/view_requirement.py +++ b/rllib/connectors/agent/view_requirement.py @@ -36,7 +36,7 @@ def __init__(self, ctx: ConnectorContext): super().__init__(ctx) self._view_requirements = ctx.view_requirements - _enable_new_api_stack = ctx.config.get("_enable_new_api_stack", False) + _enable_new_api_stack = ctx.config.get("enable_rl_module_and_learner", False) # a dict of env_id to a dict of agent_id to a list of agent_collector objects self.agent_collectors = defaultdict( @@ -68,7 +68,7 @@ def reset(self, env_id: str): def transform(self, ac_data: AgentConnectorDataType) -> AgentConnectorDataType: d = ac_data.data assert ( - type(d) == dict + type(d) is dict ), "Single agent data must be of type Dict[str, TensorStructType]" env_id = ac_data.env_id diff --git a/rllib/core/learner/learner.py b/rllib/core/learner/learner.py index 3f58372061c6..43229e79ea30 100644 --- a/rllib/core/learner/learner.py +++ b/rllib/core/learner/learner.py @@ -299,7 +299,7 @@ def build(self) -> None: return # Build learner connector pipeline used on this Learner worker. - if self.config.uses_new_env_runners: + if self.config.enable_env_runner_and_connector_v2: # TODO (sven): Figure out which space to provide here. For now, # it doesn't matter, as the default connector piece doesn't use # this information anyway. 
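A short aside on the `type(d) == dict` -> `type(d) is dict` edits in the connector files above (plain-Python illustration, unrelated to any RLlib API): `is` compares the two type objects by identity, which is the exact-type check linters such as flake8 (E721) ask for; `isinstance` would be the right tool if subclasses were meant to pass as well.

from collections import OrderedDict

d = {}
print(type(d) is dict)                  # True: exact type match
print(type(OrderedDict()) is dict)      # False: subclasses fail an exact-type check
print(isinstance(OrderedDict(), dict))  # True: isinstance accepts subclasses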
diff --git a/rllib/core/models/tests/test_catalog.py b/rllib/core/models/tests/test_catalog.py index b253feb1a5e8..f7f81074b32a 100644 --- a/rllib/core/models/tests/test_catalog.py +++ b/rllib/core/models/tests/test_catalog.py @@ -388,7 +388,7 @@ def build_vf_head(self, framework): config = ( PPOConfig() - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .rl_module( rl_module_spec=SingleAgentRLModuleSpec(catalog_class=MyCatalog), ) diff --git a/rllib/core/testing/tests/test_bc_algorithm.py b/rllib/core/testing/tests/test_bc_algorithm.py index a41250c43c4e..fe798a4a4846 100644 --- a/rllib/core/testing/tests/test_bc_algorithm.py +++ b/rllib/core/testing/tests/test_bc_algorithm.py @@ -33,8 +33,8 @@ def test_bc_algorithm(self): config = ( BCConfigTest() + .api_stack(enable_rl_module_and_learner=True) .training(model={"fcnet_hiddens": [32, 32]}) - .experimental(_enable_new_api_stack=True) ) # TODO (Kourosh): Add tf2 support @@ -54,7 +54,7 @@ def test_bc_algorithm_marl(self): policies = {"policy_1", "policy_2"} config = ( BCConfigTest() - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .training(model={"fcnet_hiddens": [32, 32]}) .multi_agent( policies=policies, @@ -98,7 +98,7 @@ def test_bc_algorithm_w_custom_marl_module(self): config = ( BCConfigTest() - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .framework(fw) .rl_module(rl_module_spec=spec) .training( diff --git a/rllib/env/multi_agent_env_runner.py b/rllib/env/multi_agent_env_runner.py index 67bc34f8885b..cf2facb1b0ae 100644 --- a/rllib/env/multi_agent_env_runner.py +++ b/rllib/env/multi_agent_env_runner.py @@ -3,6 +3,7 @@ from collections import defaultdict from functools import partial +import numpy as np from typing import DefaultDict, Dict, List, Optional from ray.rllib.algorithms.algorithm_config import AlgorithmConfig @@ -624,37 +625,18 @@ def get_metrics(self) -> ResultDict: module_episode_returns[sa_eps.module_id] += return_eps2 del self._ongoing_episodes_for_metrics[eps.id_] - # Log general episode metrics. - self.metrics.log_dict( - { - "episode_len_mean": episode_length, - "episode_return_mean": episode_return, - "episode_duration_sec_mean": episode_duration_s, - # Per-agent returns. - "agent_episode_returns_mean": agent_episode_returns, - # Per-RLModule returns. - "module_episode_returns_mean": module_episode_returns, - }, - # To mimick the old API stack behavior, we'll use `window` here for - # these particular stats (instead of the default EMA). - window=self.config.metrics_num_episodes_for_smoothing, - ) - # For some metrics, log min/max as well. - self.metrics.log_dict( - { - "episode_len_min": episode_length, - "episode_return_min": episode_return, - }, - reduce="min", - ) - self.metrics.log_dict( - { - "episode_len_max": episode_length, - "episode_return_max": episode_return, - }, - reduce="max", + self._log_episode_metrics( + episode_length, + episode_return, + episode_duration_s, + agent_episode_returns, + module_episode_returns, ) + # If no episodes at all, log NaN stats. + if len(self._done_episodes_for_metrics) == 0: + self._log_episode_metrics(np.nan, np.nan, np.nan) + # Log num episodes counter for this iteration. 
self.metrics.log_value( NUM_EPISODES, @@ -758,7 +740,7 @@ def make_env(self): env_ctx = EnvContext( env_ctx, worker_index=self.worker_index, - num_workers=self.config.num_rollout_workers, + num_workers=self.config.num_env_runners, remote=self.config.remote_worker_envs, ) @@ -865,3 +847,41 @@ def _make_on_episode_callback(self, which: str, episode=None): rl_module=self.module, env_index=0, ) + + def _log_episode_metrics(self, length, ret, sec, agents=None, modules=None): + # Log general episode metrics. + self.metrics.log_dict( + { + "episode_len_mean": length, + "episode_return_mean": ret, + "episode_duration_sec_mean": sec, + **( + { + # Per-agent returns. + "agent_episode_returns_mean": agents, + # Per-RLModule returns. + "module_episode_returns_mean": modules, + } + if agents is not None + else {} + ), + }, + # To mimick the old API stack behavior, we'll use `window` here for + # these particular stats (instead of the default EMA). + window=self.config.metrics_num_episodes_for_smoothing, + ) + # For some metrics, log min/max as well. + self.metrics.log_dict( + { + "episode_len_min": length, + "episode_return_min": ret, + }, + reduce="min", + ) + self.metrics.log_dict( + { + "episode_len_max": length, + "episode_return_max": ret, + }, + reduce="max", + ) diff --git a/rllib/env/single_agent_env_runner.py b/rllib/env/single_agent_env_runner.py index 5a4457dd7762..a3610c8e9162 100644 --- a/rllib/env/single_agent_env_runner.py +++ b/rllib/env/single_agent_env_runner.py @@ -4,6 +4,7 @@ from collections import defaultdict from functools import partial +import numpy as np from typing import DefaultDict, Dict, List, Optional from ray.rllib.algorithms.algorithm_config import AlgorithmConfig @@ -587,37 +588,14 @@ def get_metrics(self) -> ResultDict: episode_duration_s += eps2.get_duration_s() del self._ongoing_episodes_for_metrics[eps.id_] - # Log general episode metrics. - self.metrics.log_dict( - { - "episode_len_mean": episode_length, - "episode_return_mean": episode_return, - "episode_duration_sec_mean": episode_duration_s, - # Per-agent returns. - "agent_episode_returns_mean": {DEFAULT_AGENT_ID: episode_return}, - # Per-RLModule returns. - "module_episode_returns_mean": {DEFAULT_MODULE_ID: episode_return}, - }, - # To mimick the old API stack behavior, we'll use `window` here for - # these particular stats (instead of the default EMA). - window=self.config.metrics_num_episodes_for_smoothing, - ) - # For some metrics, log min/max as well. - self.metrics.log_dict( - { - "episode_len_min": episode_length, - "episode_return_min": episode_return, - }, - reduce="min", - ) - self.metrics.log_dict( - { - "episode_len_max": episode_length, - "episode_return_max": episode_return, - }, - reduce="max", + self._log_episode_metrics( + episode_length, episode_return, episode_duration_s ) + # If no episodes at all, log NaN stats. + if len(self._done_episodes_for_metrics) == 0: + self._log_episode_metrics(np.nan, np.nan, np.nan) + # Log num episodes counter for this iteration. 
self.metrics.log_value( NUM_EPISODES, @@ -704,7 +682,7 @@ def make_env(self) -> None: env_ctx = EnvContext( env_ctx, worker_index=self.worker_index, - num_workers=self.config.num_rollout_workers, + num_workers=self.config.num_env_runners, remote=self.config.remote_worker_envs, ) @@ -730,12 +708,12 @@ def make_env(self) -> None: self.env: gym.Wrapper = gym.wrappers.VectorListInfo( gym.vector.make( "rllib-single-agent-env-v0", - num_envs=self.config.num_envs_per_worker, + num_envs=self.config.num_envs_per_env_runner, asynchronous=self.config.remote_worker_envs, ) ) self.num_envs: int = self.env.num_envs - assert self.num_envs == self.config.num_envs_per_worker + assert self.num_envs == self.config.num_envs_per_env_runner # Set the flag to reset all envs upon the next `sample()` call. self._needs_initial_reset = True @@ -776,3 +754,35 @@ def _convert_to_tensor(self, struct) -> TensorType: return convert_to_torch_tensor(struct) else: return tree.map_structure(tf.convert_to_tensor, struct) + + def _log_episode_metrics(self, length, ret, sec): + # Log general episode metrics. + self.metrics.log_dict( + { + "episode_len_mean": length, + "episode_return_mean": ret, + "episode_duration_sec_mean": sec, + # Per-agent returns. + "agent_episode_returns_mean": {DEFAULT_AGENT_ID: ret}, + # Per-RLModule returns. + "module_episode_returns_mean": {DEFAULT_MODULE_ID: ret}, + }, + # To mimick the old API stack behavior, we'll use `window` here for + # these particular stats (instead of the default EMA). + window=self.config.metrics_num_episodes_for_smoothing, + ) + # For some metrics, log min/max as well. + self.metrics.log_dict( + { + "episode_len_min": length, + "episode_return_min": ret, + }, + reduce="min", + ) + self.metrics.log_dict( + { + "episode_len_max": length, + "episode_return_max": ret, + }, + reduce="max", + ) diff --git a/rllib/env/tests/test_multi_agent_env_runner.py b/rllib/env/tests/test_multi_agent_env_runner.py index d27f4779bd67..1f7f51243afb 100644 --- a/rllib/env/tests/test_multi_agent_env_runner.py +++ b/rllib/env/tests/test_multi_agent_env_runner.py @@ -95,12 +95,14 @@ def _build_config(self): # Build the configuration and use `PPO`. config = ( PPOConfig() - .experimental(_enable_new_api_stack=True) + .api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, + ) .environment( MultiAgentCartPole, env_config={"num_agents": 2}, ) - .env_runners(env_runner_cls=MultiAgentEnvRunner) # TODO (sven, simon): Setup is still for `Policy`, change as soon # as we have switched fully to the new stack. .multi_agent( diff --git a/rllib/env/tests/test_single_agent_env_runner.py b/rllib/env/tests/test_single_agent_env_runner.py index 2045963e899c..83c7bf083c22 100644 --- a/rllib/env/tests/test_single_agent_env_runner.py +++ b/rllib/env/tests/test_single_agent_env_runner.py @@ -18,7 +18,7 @@ def test_sample(self): config = ( AlgorithmConfig().environment("CartPole-v1") # Vectorize x2 and by default, rollout 64 timesteps per individual env. - .env_runners(num_envs_per_worker=2, rollout_fragment_length=64) + .env_runners(num_envs_per_env_runner=2, rollout_fragment_length=64) ) env_runner = SingleAgentEnvRunner(config=config) @@ -64,8 +64,8 @@ def test_distributed_env_runner(self): AlgorithmConfig().environment("CartPole-v1") # Vectorize x2 and by default, rollout 64 timesteps per individual env. 
.env_runners( - num_rollout_workers=5, - num_envs_per_worker=5, + num_env_runners=5, + num_envs_per_env_runner=5, rollout_fragment_length=10, remote_worker_envs=envs_parallel, ) @@ -73,7 +73,7 @@ def test_distributed_env_runner(self): array = [ remote_class.remote(config=config) - for _ in range(config.num_rollout_workers) + for _ in range(config.num_env_runners) ] # Sample in parallel. results = [a.sample.remote(random_actions=True) for a in array] @@ -83,7 +83,7 @@ def test_distributed_env_runner(self): # Assert length of all fragments is `rollout_fragment_length`. self.assertEqual( sum(len(e) for e in episodes), - config.num_envs_per_worker * config.rollout_fragment_length, + config.num_envs_per_env_runner * config.rollout_fragment_length, ) diff --git a/rllib/evaluation/env_runner_v2.py b/rllib/evaluation/env_runner_v2.py index 2c095fe4b12c..fc488f8e8ee2 100644 --- a/rllib/evaluation/env_runner_v2.py +++ b/rllib/evaluation/env_runner_v2.py @@ -181,7 +181,7 @@ def _build_multi_agent_batch( policy = collector.policy - if policy.config.get("_enable_new_api_stack", False): + if policy.config.get("enable_rl_module_and_learner", False): # Before we send the collected batch back for training, we may need # to add a time dimension for the RLModule. seq_lens = batch.get(SampleBatch.SEQ_LENS) @@ -1072,7 +1072,7 @@ def _try_find_policy_again(eval_data: AgentConnectorDataType): # changed (mapping fn not staying constant within one episode). policy: Policy = _try_find_policy_again(eval_data) - if policy.config.get("_enable_new_api_stack", False): + if policy.config.get("enable_rl_module_and_learner", False): # _batch_inference_sample_batches does nothing but concatenating AND # setting SEQ_LENS to ones in the recurrent case. We do not need this # because RLModules do not care about SEQ_LENS anymore. They have an @@ -1147,11 +1147,13 @@ def _process_policy_eval_results( input_dict: TensorStructType = eval_data[i].data.raw_dict rnn_states: List[StateBatches] = tree.map_structure( - lambda x: x[i], rnn_out + lambda x, i=i: x[i], rnn_out ) # extra_action_out could be a nested dict - fetches: Dict = tree.map_structure(lambda x: x[i], extra_action_out) + fetches: Dict = tree.map_structure( + lambda x, i=i: x[i], extra_action_out + ) # Post-process policy output by running them through action connectors. ac_data = ActionConnectorDataType( diff --git a/rllib/evaluation/episode_v2.py b/rllib/evaluation/episode_v2.py index 25ed5a36a719..b4d15f94548c 100644 --- a/rllib/evaluation/episode_v2.py +++ b/rllib/evaluation/episode_v2.py @@ -191,7 +191,9 @@ def add_init_obs( ), is_policy_recurrent=policy.is_recurrent(), intial_states=policy.get_initial_state(), - _enable_new_api_stack=policy.config.get("_enable_new_api_stack", False), + _enable_new_api_stack=policy.config.get( + "enable_rl_module_and_learner", False + ), ) self._agent_collectors[agent_id].add_init_obs( episode_id=self.episode_id, diff --git a/rllib/evaluation/postprocessing.py b/rllib/evaluation/postprocessing.py index 65ebfc3350c5..6c8afce541d5 100644 --- a/rllib/evaluation/postprocessing.py +++ b/rllib/evaluation/postprocessing.py @@ -270,7 +270,7 @@ def compute_bootstrap_value(sample_batch: SampleBatch, policy: Policy) -> Sample input_dict = sample_batch.get_single_step_input_dict( policy.view_requirements, index="last" ) - if policy.config.get("_enable_new_api_stack"): + if policy.config.get("enable_rl_module_and_learner"): # Note: During sampling you are using the parameters at the beginning of # the sampling process. 
If I'll be using this advantages during training # should it not be the latest parameters during training for this to be diff --git a/rllib/evaluation/rollout_worker.py b/rllib/evaluation/rollout_worker.py index 0c1148c8daf8..d7fb7101fee8 100644 --- a/rllib/evaluation/rollout_worker.py +++ b/rllib/evaluation/rollout_worker.py @@ -492,7 +492,7 @@ def wrap(env): ) # This is only for the old API where local_worker was responsible for learning - if not self.config._enable_new_api_stack: + if not self.config.enable_rl_module_and_learner: # Error if we don't find enough GPUs. if ( ray.is_initialized() @@ -537,7 +537,7 @@ def wrap(env): # state. for pol in self.policy_map.values(): if not pol._model_init_state_automatically_added and not pol.config.get( - "_enable_new_api_stack", False + "enable_rl_module_and_learner", False ): pol._update_model_view_requirements_from_init_state() @@ -695,7 +695,7 @@ def sample(self, **kwargs) -> SampleBatchType: self.config.batch_mode == "truncate_episodes" and not self.config.offline_sampling ): - max_batches = self.config.num_envs_per_worker + max_batches = self.config.num_envs_per_env_runner else: max_batches = float("inf") while steps_so_far < self.total_rollout_fragment_length and ( @@ -1121,7 +1121,7 @@ def add_policy( """ validate_policy_id(policy_id, error=False) - if module_spec is not None and not self.config._enable_new_api_stack: + if module_spec is not None and not self.config.enable_rl_module_and_learner: raise ValueError( "If you pass in module_spec to the policy, the RLModule API needs " "to be enabled." @@ -1714,7 +1714,7 @@ def _update_policy_map( updated_policy_dict = self._get_complete_policy_specs_dict(policy_dict) # Use the updated policy dict to create the marl_module_spec if necessary - if self.config._enable_new_api_stack: + if self.config.enable_rl_module_and_learner: spec = self.config.get_marl_module_spec( policy_dict=updated_policy_dict, single_agent_rl_module_spec=single_agent_rl_module_spec, @@ -1793,7 +1793,7 @@ def _get_complete_policy_specs_dict( obs_space, merged_conf.model, include_multi_binary=self.config.get( - "_enable_new_api_stack", False + "enable_rl_module_and_learner", False ), ) # Original observation space should be accessible at @@ -1860,9 +1860,9 @@ def _build_policy_map( new_policy = policy # Maybe torch compile an RLModule. 
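On the `lambda x, i=i: x[i]` edits in env_runner_v2.py further above (plain-Python illustration, not RLlib code): binding the loop variable as a default argument freezes its value per iteration, which avoids the classic late-binding surprise (and the linter warning about capturing a loop variable) whenever such a lambda outlives the loop.

data = ["a", "b", "c"]

late_bound = [lambda x: x[i] for i in range(3)]
default_bound = [lambda x, i=i: x[i] for i in range(3)]

print([f(data) for f in late_bound])     # ['c', 'c', 'c'] -- every closure sees the final i
print([f(data) for f in default_bound])  # ['a', 'b', 'c'] -- each lambda kept its own i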
- if self.config.get("_enable_new_api_stack", False) and self.config.get( - "torch_compile_worker" - ): + if self.config.get( + "enable_rl_module_and_learner", False + ) and self.config.get("torch_compile_worker"): if self.config.framework_str != "torch": raise ValueError("Attempting to compile a non-torch RLModule.") rl_module = getattr(new_policy, "model", None) diff --git a/rllib/evaluation/tests/test_envs_that_crash.py b/rllib/evaluation/tests/test_envs_that_crash.py index 11e74a857417..7472e496b0d6 100644 --- a/rllib/evaluation/tests/test_envs_that_crash.py +++ b/rllib/evaluation/tests/test_envs_that_crash.py @@ -60,7 +60,7 @@ def test_env_crash_on_one_worker_during_sampling_but_ignore(self): """Expect some sub-envs on one worker to fail (and not recover), but ignore.""" config = ( PPOConfig() - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .env_runners( num_env_runners=2, num_envs_per_env_runner=3, @@ -98,9 +98,8 @@ def test_env_crash_on_one_worker_during_sampling_but_recreate_worker(self): """Expect some sub-envs to fail (and not recover), but re-create worker.""" config = ( PPOConfig() - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .env_runners( - # env_runner_cls=ForwardHealthCheckToEnvWorker, num_env_runners=2, rollout_fragment_length=10, num_envs_per_env_runner=3, diff --git a/rllib/evaluation/tests/test_rollout_worker.py b/rllib/evaluation/tests/test_rollout_worker.py index 64ad008dd205..6540cb187bc3 100644 --- a/rllib/evaluation/tests/test_rollout_worker.py +++ b/rllib/evaluation/tests/test_rollout_worker.py @@ -684,7 +684,7 @@ def test_truncate_episodes(self): default_policy_class=MockPolicy, config=AlgorithmConfig().env_runners( rollout_fragment_length=15, - num_rollout_workers=0, + num_env_runners=0, batch_mode="truncate_episodes", ), ) @@ -700,7 +700,7 @@ def test_truncate_episodes(self): default_policy_class=MockPolicy, config=AlgorithmConfig() .env_runners( - num_rollout_workers=0, + num_env_runners=0, batch_mode="truncate_episodes", rollout_fragment_length=301, ) @@ -725,7 +725,7 @@ def test_truncate_episodes(self): default_policy_class=MockPolicy, config=AlgorithmConfig() .env_runners( - num_rollout_workers=0, + num_env_runners=0, rollout_fragment_length=301, ) .multi_agent( @@ -754,7 +754,7 @@ def test_complete_episodes(self): default_policy_class=MockPolicy, config=AlgorithmConfig().env_runners( rollout_fragment_length=5, - num_rollout_workers=0, + num_env_runners=0, batch_mode="complete_episodes", ), ) @@ -768,7 +768,7 @@ def test_complete_episodes_packing(self): default_policy_class=MockPolicy, config=AlgorithmConfig().env_runners( rollout_fragment_length=15, - num_rollout_workers=0, + num_env_runners=0, batch_mode="complete_episodes", ), ) @@ -786,7 +786,7 @@ def test_filter_sync(self): env_creator=lambda _: gym.make("CartPole-v1"), default_policy_class=MockPolicy, config=AlgorithmConfig().env_runners( - num_rollout_workers=0, + num_env_runners=0, observation_filter="ConcurrentMeanStdFilter", ), ) @@ -804,7 +804,7 @@ def test_get_filters(self): default_policy_class=MockPolicy, config=AlgorithmConfig().env_runners( observation_filter="ConcurrentMeanStdFilter", - num_rollout_workers=0, + num_env_runners=0, ), ) self.sample_and_flush(ev) @@ -823,7 +823,7 @@ def test_sync_filter(self): default_policy_class=MockPolicy, config=AlgorithmConfig().env_runners( observation_filter="ConcurrentMeanStdFilter", - num_rollout_workers=0, + num_env_runners=0, ), ) obs_f = 
self.sample_and_flush(ev) @@ -852,7 +852,7 @@ def test_extra_python_envs(self): default_policy_class=MockPolicy, config=AlgorithmConfig() .python_environment(extra_python_environs_for_driver=extra_envs) - .env_runners(num_rollout_workers=0), + .env_runners(num_env_runners=0), ) self.assertTrue("env_key_1" in os.environ) self.assertTrue("env_key_2" in os.environ) @@ -866,9 +866,7 @@ def test_no_env_seed(self): ev = RolloutWorker( env_creator=lambda _: MockVectorEnv(20, mocked_num_envs=8), default_policy_class=MockPolicy, - config=AlgorithmConfig() - .env_runners(num_rollout_workers=0) - .debugging(seed=1), + config=AlgorithmConfig().env_runners(num_env_runners=0).debugging(seed=1), ) assert not hasattr(ev.env, "seed") ev.stop() @@ -878,7 +876,7 @@ def test_multi_env_seed(self): env_creator=lambda _: MockEnv2(100), default_policy_class=MockPolicy, config=AlgorithmConfig() - .env_runners(num_envs_per_env_runner=3, num_rollout_workers=0) + .env_runners(num_envs_per_env_runner=3, num_env_runners=0) .debugging(seed=1), ) # Make sure we can properly sample from the wrapped env. @@ -913,7 +911,7 @@ def step(self, action_dict): env_creator=lambda _: MockMultiAgentEnv(), default_policy_class=MockPolicy, config=AlgorithmConfig() - .env_runners(num_envs_per_env_runner=3, num_rollout_workers=0) + .env_runners(num_envs_per_env_runner=3, num_env_runners=0) .multi_agent(policies={"policy_1", "policy_2"}) .debugging(seed=1), ) @@ -931,7 +929,7 @@ def test_wrap_multi_agent_env(self): config=AlgorithmConfig().env_runners( rollout_fragment_length=5, batch_mode="complete_episodes", - num_rollout_workers=0, + num_env_runners=0, ), ) # Make sure we can properly sample from the wrapped env. @@ -963,7 +961,7 @@ def step(self, action): config=AlgorithmConfig().env_runners( rollout_fragment_length=5, batch_mode="complete_episodes", - num_rollout_workers=0, + num_env_runners=0, ), ) batch = ev.sample() @@ -978,7 +976,7 @@ def step(self, action): config=AlgorithmConfig().env_runners( rollout_fragment_length=5, batch_mode="complete_episodes", - num_rollout_workers=0, + num_env_runners=0, ), ) batch = ev.sample() diff --git a/rllib/evaluation/tests/test_trajectory_view_api.py b/rllib/evaluation/tests/test_trajectory_view_api.py index 1552c059214c..3d965f7c99a7 100644 --- a/rllib/evaluation/tests/test_trajectory_view_api.py +++ b/rllib/evaluation/tests/test_trajectory_view_api.py @@ -107,7 +107,7 @@ def test_traj_view_lstm_prev_actions_and_rewards(self): # and Learner API. config = ( ppo.PPOConfig() - .experimental(_enable_new_api_stack=False) + .api_stack(enable_rl_module_and_learner=False) .environment("CartPole-v1") # Activate LSTM + prev-action + rewards. .training( @@ -187,7 +187,7 @@ def test_traj_view_attention_net(self): config = ( ppo.PPOConfig() # Batch-norm models have not been migrated to the RL Module API yet. 
- .experimental(_enable_new_api_stack=False) + .api_stack(enable_rl_module_and_learner=False) .environment( "ray.rllib.examples.envs.classes.debug_counter_env.DebugCounterEnv", env_config={"config": {"start_at_t": 1}}, # first obs is [1.0] @@ -229,7 +229,7 @@ def test_traj_view_next_action(self): action_space = Discrete(2) config = ( ppo.PPOConfig() - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .framework("torch") .env_runners(rollout_fragment_length=200, num_env_runners=0) ) @@ -307,7 +307,7 @@ def policy_fn(agent_id, episode, worker, **kwargs): config = ( ppo.PPOConfig() # The Policy used to be passed in, now we have to pass in the RLModuleSpecs - .experimental(_enable_new_api_stack=False) + .api_stack(enable_rl_module_and_learner=False) .framework("torch") .multi_agent(policies=policies, policy_mapping_fn=policy_fn) .training( @@ -332,7 +332,7 @@ def test_counting_by_agent_steps(self): config = ( ppo.PPOConfig() - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) # Env setup. .environment(MultiAgentPendulum, env_config={"num_agents": num_agents}) .env_runners(num_env_runners=2, rollout_fragment_length=21) diff --git a/rllib/evaluation/worker_set.py b/rllib/evaluation/worker_set.py index a1a02a5651d1..73f6a3f4ed50 100644 --- a/rllib/evaluation/worker_set.py +++ b/rllib/evaluation/worker_set.py @@ -138,9 +138,21 @@ def __init__( } # Set the EnvRunner subclass to be used as "workers". Default: RolloutWorker. - self.env_runner_cls = ( - RolloutWorker if config.env_runner_cls is None else config.env_runner_cls - ) + self.env_runner_cls = config.env_runner_cls + if self.env_runner_cls is None: + if config.enable_env_runner_and_connector_v2: + if config.is_multi_agent(): + from ray.rllib.env.multi_agent_env_runner import MultiAgentEnvRunner + + self.env_runner_cls = MultiAgentEnvRunner + else: + from ray.rllib.env.single_agent_env_runner import ( + SingleAgentEnvRunner, + ) + + self.env_runner_cls = SingleAgentEnvRunner + else: + self.env_runner_cls = RolloutWorker self._cls = ray.remote(**self._remote_args)(self.env_runner_cls).remote self._logdir = logdir @@ -239,7 +251,7 @@ def _setup( if ( local_worker and self.__worker_manager.num_actors() > 0 - and not config.uses_new_env_runners + and not config.enable_env_runner_and_connector_v2 and not config.create_env_on_local_worker and (not config.observation_space or not config.action_space) ): diff --git a/rllib/examples/_docs/rllib_on_rllib_readme.py b/rllib/examples/_docs/rllib_on_rllib_readme.py index 1c3c2d330b4c..d0e9be49a97d 100644 --- a/rllib/examples/_docs/rllib_on_rllib_readme.py +++ b/rllib/examples/_docs/rllib_on_rllib_readme.py @@ -59,7 +59,7 @@ def step(self, action): env_config={"parrot_shriek_range": gym.spaces.Box(-5.0, 5.0, (1,))}, ) # Parallelize environment rollouts. 
- .env_runners(num_rollout_workers=3) + .env_runners(num_env_runners=3) ) algo = config.build() diff --git a/rllib/examples/_old_api_stack/complex_struct_space.py b/rllib/examples/_old_api_stack/complex_struct_space.py index c2e45b406c7b..075b8831d04c 100644 --- a/rllib/examples/_old_api_stack/complex_struct_space.py +++ b/rllib/examples/_old_api_stack/complex_struct_space.py @@ -40,7 +40,7 @@ PPOConfig() .environment(SimpleRPG) .framework(args.framework) - .env_runners(rollout_fragment_length=1, num_rollout_workers=0) + .env_runners(rollout_fragment_length=1, num_env_runners=0) .training(train_batch_size=2, model={"custom_model": "my_model"}) # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0. .resources(num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0"))) diff --git a/rllib/examples/_old_api_stack/connectors/prepare_checkpoint.py b/rllib/examples/_old_api_stack/connectors/prepare_checkpoint.py index ae191e78513a..35d151341fcb 100644 --- a/rllib/examples/_old_api_stack/connectors/prepare_checkpoint.py +++ b/rllib/examples/_old_api_stack/connectors/prepare_checkpoint.py @@ -23,8 +23,8 @@ def _policy_mapping_fn(*args, **kwargs): # and use a TF policy in a Torch training stack. .framework("tf2") .env_runners( - num_rollout_workers=1, - num_envs_per_worker=5, + num_env_runners=1, + num_envs_per_env_runner=5, # We will be restoring a TF2 policy. # So tell the RolloutWorkers to enable TF eager exec as well, even if # framework is set to torch. diff --git a/rllib/examples/_old_api_stack/connectors/self_play_with_policy_checkpoint.py b/rllib/examples/_old_api_stack/connectors/self_play_with_policy_checkpoint.py index cae3c2493c82..38531c626b5f 100644 --- a/rllib/examples/_old_api_stack/connectors/self_play_with_policy_checkpoint.py +++ b/rllib/examples/_old_api_stack/connectors/self_play_with_policy_checkpoint.py @@ -73,8 +73,8 @@ def main(checkpoint_dir): .framework("torch") .callbacks(partial(AddPolicyCallback, checkpoint_dir)) .env_runners( - num_rollout_workers=1, - num_envs_per_worker=5, + num_env_runners=1, + num_envs_per_env_runner=5, # We will be restoring a TF2 policy. # So tell the RolloutWorkers to enable TF eager exec as well, even if # framework is set to torch. diff --git a/rllib/examples/_old_api_stack/policy/episode_env_aware_policy.py b/rllib/examples/_old_api_stack/policy/episode_env_aware_policy.py index 156ead8f3341..47ce9b92c884 100644 --- a/rllib/examples/_old_api_stack/policy/episode_env_aware_policy.py +++ b/rllib/examples/_old_api_stack/policy/episode_env_aware_policy.py @@ -37,7 +37,7 @@ def is_recurrent(self): return True def get_initial_state(self): - if self.config.get("_enable_new_api_stack", False): + if self.config.get("enable_rl_module_and_learner", False): # convert the tree of tensors to a tree to numpy arrays return tree.map_structure( lambda s: convert_to_numpy(s), self.model.get_initial_state() diff --git a/rllib/examples/_old_api_stack/remote_base_env_with_custom_api.py b/rllib/examples/_old_api_stack/remote_base_env_with_custom_api.py index a46d6b628133..77b47fb23083 100644 --- a/rllib/examples/_old_api_stack/remote_base_env_with_custom_api.py +++ b/rllib/examples/_old_api_stack/remote_base_env_with_custom_api.py @@ -120,11 +120,11 @@ def on_train_result(self, *, algorithm, result: dict, **kwargs) -> None: # through them in parallel. remote_worker_envs=True, # How many RolloutWorkers (each with n environment copies: - # `num_envs_per_worker`)? - num_rollout_workers=args.num_workers, + # `num_envs_per_env_runner`)? 
+ num_env_runners=args.num_workers, # This setting should not really matter as it does not affect the # number of GPUs reserved for each worker. - num_envs_per_worker=args.num_envs_per_worker, + num_envs_per_env_runner=args.num_envs_per_env_runner, ) # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0. .resources(num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0"))) diff --git a/rllib/examples/_old_api_stack/remote_envs_with_inference_done_on_main_node.py b/rllib/examples/_old_api_stack/remote_envs_with_inference_done_on_main_node.py index 52b77840613e..014c6e9fc948 100644 --- a/rllib/examples/_old_api_stack/remote_envs_with_inference_done_on_main_node.py +++ b/rllib/examples/_old_api_stack/remote_envs_with_inference_done_on_main_node.py @@ -108,7 +108,7 @@ def default_resource_request( { # Different bundle (meaning: possibly different node) # for your n "remote" envs (set remote_worker_envs=True). - "CPU": cf.num_envs_per_worker, + "CPU": cf.num_envs_per_env_runner, }, ], strategy=cf.placement_strategy, @@ -128,12 +128,12 @@ def default_resource_request( # Force sub-envs to be ray.actor.ActorHandles, so we can step # through them in parallel. remote_worker_envs=True, - num_envs_per_worker=args.num_envs_per_worker, + num_envs_per_env_runner=args.num_envs_per_worker, # Use a single worker (however, with n parallelized remote envs, maybe # even running on another node). # Action computations occur on the "main" (GPU?) node, while # the envs run on one or more CPU node(s). - num_rollout_workers=0, + num_env_runners=0, ) .resources( # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0. diff --git a/rllib/examples/action_masking.py b/rllib/examples/action_masking.py index 42d2613106d0..c9bab618fdf1 100644 --- a/rllib/examples/action_masking.py +++ b/rllib/examples/action_masking.py @@ -109,10 +109,8 @@ def get_cli_args(): ) # We need to disable preprocessing of observations, because preprocessing # would flatten the observation dict of the environment. - .experimental( - _enable_new_api_stack=True, - _disable_preprocessor_api=True, - ) + .api_stack(enable_rl_module_and_learner=True) + .experimental(_disable_preprocessor_api=True) .framework(args.framework) .resources( # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0. diff --git a/rllib/examples/algorithms/custom_training_step_on_and_off_policy_combined.py b/rllib/examples/algorithms/custom_training_step_on_and_off_policy_combined.py index c083678f83b7..5bfe1dd513f7 100644 --- a/rllib/examples/algorithms/custom_training_step_on_and_off_policy_combined.py +++ b/rllib/examples/algorithms/custom_training_step_on_and_off_policy_combined.py @@ -192,12 +192,11 @@ def policy_mapping_fn(agent_id, episode, worker, **kwargs): config = ( AlgorithmConfig() - # TODO (Kourosh): Migrate this to the new RLModule / Learner API. - .experimental(_enable_new_api_stack=False) + .api_stack(enable_rl_module_and_learner=False) .environment("multi_agent_cartpole") .framework("torch" if args.torch else "tf") .multi_agent(policies=policies, policy_mapping_fn=policy_mapping_fn) - .env_runners(num_rollout_workers=0, rollout_fragment_length=50) + .env_runners(num_env_runners=0, rollout_fragment_length=50) # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0. 
.resources(num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0"))) .reporting(metrics_num_episodes_for_smoothing=30) diff --git a/rllib/examples/autoregressive_action_dist.py b/rllib/examples/autoregressive_action_dist.py index f6245211847a..6cf31f0bae24 100644 --- a/rllib/examples/autoregressive_action_dist.py +++ b/rllib/examples/autoregressive_action_dist.py @@ -140,7 +140,7 @@ def get_cli_args(): get_trainable_cls(args.run) .get_default_config() # Batch-norm models have not been migrated to the RL Module API yet. - .experimental(_enable_new_api_stack=False) + .api_stack(enable_rl_module_and_learner=False) .environment(CorrelatedActionsEnv) .framework(args.framework) .training(gamma=0.5) diff --git a/rllib/examples/catalogs/mobilenet_v2_encoder.py b/rllib/examples/catalogs/mobilenet_v2_encoder.py index 4f94994d8586..beebdb79f773 100644 --- a/rllib/examples/catalogs/mobilenet_v2_encoder.py +++ b/rllib/examples/catalogs/mobilenet_v2_encoder.py @@ -44,13 +44,13 @@ def _get_encoder_config( # Create a generic config with our enhanced Catalog ppo_config = ( PPOConfig() - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .rl_module( rl_module_spec=SingleAgentRLModuleSpec( catalog_class=MobileNetEnhancedPPOCatalog ) ) - .env_runners(num_rollout_workers=0) + .env_runners(num_env_runners=0) # The following training settings make it so that a training iteration is very # quick. This is just for the sake of this example. PPO will not learn properly # with these settings! diff --git a/rllib/examples/checkpoints/onnx_tf.py b/rllib/examples/checkpoints/onnx_tf.py index 0093afd0fd9e..f63847f117f8 100644 --- a/rllib/examples/checkpoints/onnx_tf.py +++ b/rllib/examples/checkpoints/onnx_tf.py @@ -25,8 +25,8 @@ config = ( ppo.PPOConfig() # ONNX is not supported by RLModule API yet. - .experimental(_enable_new_api_stack=False) - .env_runners(num_rollout_workers=1) + .api_stack(enable_rl_module_and_learner=False) + .env_runners(num_env_runners=1) .framework(args.framework) ) diff --git a/rllib/examples/checkpoints/onnx_torch.py b/rllib/examples/checkpoints/onnx_torch.py index 008be01378a7..77a1ffb5f28a 100644 --- a/rllib/examples/checkpoints/onnx_torch.py +++ b/rllib/examples/checkpoints/onnx_torch.py @@ -14,8 +14,8 @@ config = ( ppo.PPOConfig() # ONNX is not supported by RLModule API yet. 
- .experimental(_enable_new_api_stack=False) - .env_runners(num_rollout_workers=1) + .api_stack(enable_rl_module_and_learner=False) + .env_runners(num_env_runners=1) .framework("torch") ) diff --git a/rllib/examples/curriculum/curriculum_learning.py b/rllib/examples/curriculum/curriculum_learning.py index 2f5dd21f3c4a..b0cb6865e98a 100644 --- a/rllib/examples/curriculum/curriculum_learning.py +++ b/rllib/examples/curriculum/curriculum_learning.py @@ -214,7 +214,7 @@ def on_train_result( model={"vf_share_layers": True}, ) .env_runners( - num_envs_per_worker=5, + num_envs_per_env_runner=5, env_to_module_connector=lambda env: [ AddObservationsFromEpisodesToBatch(), FlattenObservations(), diff --git a/rllib/examples/debugging/deterministic_training.py b/rllib/examples/debugging/deterministic_training.py index 8b819941c98b..12bccb28f508 100644 --- a/rllib/examples/debugging/deterministic_training.py +++ b/rllib/examples/debugging/deterministic_training.py @@ -40,8 +40,8 @@ ) .framework(args.framework) .env_runners( - num_rollout_workers=1, - num_envs_per_worker=2, + num_env_runners=1, + num_envs_per_env_runner=2, rollout_fragment_length=50, ) .resources( @@ -89,7 +89,7 @@ check(results1["hist_stats"], results2["hist_stats"]) # As well as training behavior (minibatch sequence during SGD # iterations). - if config._enable_new_api_stack: + if config.enable_rl_module_and_learner: check( results1["info"][LEARNER_INFO][DEFAULT_POLICY_ID], results2["info"][LEARNER_INFO][DEFAULT_POLICY_ID], diff --git a/rllib/examples/envs/env_rendering_and_recording.py b/rllib/examples/envs/env_rendering_and_recording.py index de915f8be8e5..e5fcd4891e59 100644 --- a/rllib/examples/envs/env_rendering_and_recording.py +++ b/rllib/examples/envs/env_rendering_and_recording.py @@ -101,7 +101,7 @@ def render(self, mode="rgb"): ) .framework(args.framework) # Use a vectorized env with 2 sub-envs. - .env_runners(num_envs_per_worker=2, num_rollout_workers=1) + .env_runners(num_envs_per_env_runner=2, num_env_runners=1) .evaluation( # Evaluate once per training iteration. evaluation_interval=1, diff --git a/rllib/examples/envs/external_envs/cartpole_server.py b/rllib/examples/envs/external_envs/cartpole_server.py index 09d2ba5e8283..d6c661590387 100755 --- a/rllib/examples/envs/external_envs/cartpole_server.py +++ b/rllib/examples/envs/external_envs/cartpole_server.py @@ -174,7 +174,7 @@ def _input(ioctx): .offline_data(input_=_input) # Use n worker processes to listen on different ports. .env_runners( - num_rollout_workers=args.num_workers, + num_env_runners=args.num_workers, # Connectors are not compatible with the external env. enable_connectors=False, ) @@ -186,7 +186,7 @@ def _input(ioctx): # Disable RLModules because they need connectors # TODO (Sven): Deprecate ExternalEnv (via EnvRunner path) and reenable connectors # and RL Modules here. - config.experimental(_enable_new_api_stack=False) + config.api_stack(enable_rl_module_and_learner=False) # DQN. if args.run == "DQN" or args.run == "APEX" or args.run == "R2D2": diff --git a/rllib/examples/envs/external_envs/unity3d_server.py b/rllib/examples/envs/external_envs/unity3d_server.py index 00129aea074b..e5b17ca1d16f 100755 --- a/rllib/examples/envs/external_envs/unity3d_server.py +++ b/rllib/examples/envs/external_envs/unity3d_server.py @@ -133,7 +133,7 @@ def _input(ioctx): .framework(args.framework) # Use n worker processes to listen on different ports. 
.env_runners( - num_rollout_workers=args.num_workers, + num_env_runners=args.num_workers, rollout_fragment_length=20, enable_connectors=False, ) diff --git a/rllib/examples/envs/greyscale_env.py b/rllib/examples/envs/greyscale_env.py index 480b0c77c41d..9278b53ec2a9 100644 --- a/rllib/examples/envs/greyscale_env.py +++ b/rllib/examples/envs/greyscale_env.py @@ -83,8 +83,8 @@ def env_creator(config): PPOConfig() .environment("pistonball", env_config={"local_ratio": 0.5}, clip_rewards=True) .env_runners( - num_rollout_workers=15 if not args.as_test else 2, - num_envs_per_worker=1, + num_env_runners=15 if not args.as_test else 2, + num_envs_per_env_runner=1, observation_filter="NoFilter", rollout_fragment_length="auto", ) diff --git a/rllib/examples/envs/unity3d_env_local.py b/rllib/examples/envs/unity3d_env_local.py index 46bc9f95af5b..bfe4a4a4a165 100644 --- a/rllib/examples/envs/unity3d_env_local.py +++ b/rllib/examples/envs/unity3d_env_local.py @@ -132,7 +132,7 @@ # For running in editor, force to use just one Worker (we only have # one Unity running)! .env_runners( - num_rollout_workers=args.num_workers if args.file_name else 0, + num_env_runners=args.num_workers if args.file_name else 0, rollout_fragment_length=200, ) .training( diff --git a/rllib/examples/evaluation/evaluation_parallel_to_training.py b/rllib/examples/evaluation/evaluation_parallel_to_training.py index d5d035282a03..d1e45bed5624 100644 --- a/rllib/examples/evaluation/evaluation_parallel_to_training.py +++ b/rllib/examples/evaluation/evaluation_parallel_to_training.py @@ -138,7 +138,7 @@ def on_train_result(self, *, algorithm: Algorithm, result: ResultDict, **kwargs) # `evaluation_num_env_runners` or `evaluation_parallel_to_training`). if eval_env_runner_results and NUM_EPISODES in eval_env_runner_results: num_episodes_done = eval_env_runner_results[NUM_EPISODES] - if algorithm.config.uses_new_env_runners: + if algorithm.config.enable_env_runner_and_connector_v2: num_timesteps_reported = eval_env_runner_results[NUM_ENV_STEPS_SAMPLED] else: num_timesteps_reported = eval_results["timesteps_this_iter"] diff --git a/rllib/examples/gpus/fractional_gpus.py b/rllib/examples/gpus/fractional_gpus.py index 6b70fd621b34..ad87ba866e46 100644 --- a/rllib/examples/gpus/fractional_gpus.py +++ b/rllib/examples/gpus/fractional_gpus.py @@ -95,12 +95,12 @@ num_gpus_per_worker=args.num_gpus_per_worker, ) # How many RolloutWorkers (each with n environment copies: - # `num_envs_per_worker`)? + # `num_envs_per_env_runner`)? .env_runners( - num_rollout_workers=args.num_workers, + num_env_runners=args.num_workers, # This setting should not really matter as it does not affect the # number of GPUs reserved for each worker. - num_envs_per_worker=args.num_envs_per_worker, + num_envs_per_env_runner=args.num_envs_per_worker, ) # 4 tune trials altogether. 
.training(lr=tune.grid_search([0.005, 0.003, 0.001, 0.0001])) diff --git a/rllib/examples/hierarchical/hierarchical_training.py b/rllib/examples/hierarchical/hierarchical_training.py index c62ee4b73ce3..76f23907b652 100644 --- a/rllib/examples/hierarchical/hierarchical_training.py +++ b/rllib/examples/hierarchical/hierarchical_training.py @@ -86,7 +86,7 @@ param_space=( PPOConfig() .environment(WindyMazeEnv) - .env_runners(num_rollout_workers=0) + .env_runners(num_env_runners=0) .framework(args.framework) ).to_dict(), ).fit() @@ -103,7 +103,7 @@ def policy_mapping_fn(agent_id, episode, worker, **kwargs): PPOConfig() .environment(HierarchicalWindyMazeEnv) .framework(args.framework) - .env_runners(num_rollout_workers=0) + .env_runners(num_env_runners=0) .training(entropy_coeff=0.01) .multi_agent( policies={ diff --git a/rllib/examples/inference/policy_inference_after_training_with_attention.py b/rllib/examples/inference/policy_inference_after_training_with_attention.py index 5086831bcda2..b42a05316d9f 100644 --- a/rllib/examples/inference/policy_inference_after_training_with_attention.py +++ b/rllib/examples/inference/policy_inference_after_training_with_attention.py @@ -78,7 +78,7 @@ get_trainable_cls(args.run) .get_default_config() # TODO (Kourosh): Enable when Attentions are supported. - .experimental(_enable_new_api_stack=False) + .api_stack(enable_rl_module_and_learner=False) .environment("FrozenLake-v1") # Run with tracing enabled for tf2? .framework(args.framework) diff --git a/rllib/examples/inference/policy_inference_after_training_with_lstm.py b/rllib/examples/inference/policy_inference_after_training_with_lstm.py index 19aabe0dbc9f..2a428bdcfb26 100644 --- a/rllib/examples/inference/policy_inference_after_training_with_lstm.py +++ b/rllib/examples/inference/policy_inference_after_training_with_lstm.py @@ -130,7 +130,7 @@ # Set LSTM's initial internal state. lstm_cell_size = config["model"]["lstm_cell_size"] # range(2) b/c h- and c-states of the LSTM. 
- if algo.config._enable_new_api_stack: + if algo.config.enable_rl_module_and_learner: init_state = state = algo.get_policy().model.get_initial_state() else: init_state = state = [np.zeros([lstm_cell_size], np.float32) for _ in range(2)] diff --git a/rllib/examples/learners/ppo_load_rl_modules.py b/rllib/examples/learners/ppo_load_rl_modules.py index 6b87ba4d50a1..ef8ebf7684f7 100644 --- a/rllib/examples/learners/ppo_load_rl_modules.py +++ b/rllib/examples/learners/ppo_load_rl_modules.py @@ -58,7 +58,7 @@ def _parse_args(): # train a PPO algorithm with the loaded module config = ( PPOConfig() - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .framework(args.framework) .rl_module(rl_module_spec=module_to_load_spec) .environment("CartPole-v1") diff --git a/rllib/examples/learners/train_w_bc_finetune_w_ppo.py b/rllib/examples/learners/train_w_bc_finetune_w_ppo.py index 22830e42b39e..ddfa9a0a1164 100644 --- a/rllib/examples/learners/train_w_bc_finetune_w_ppo.py +++ b/rllib/examples/learners/train_w_bc_finetune_w_ppo.py @@ -114,7 +114,7 @@ def train_ppo_agent_from_checkpointed_module( """ config = ( PPOConfig() - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .rl_module(rl_module_spec=module_spec_from_ckpt) .environment(GYM_ENV_NAME) .training( diff --git a/rllib/examples/multi_agent/different_spaces_for_agents.py b/rllib/examples/multi_agent/different_spaces_for_agents.py index ffbc45255380..ec543de185fc 100644 --- a/rllib/examples/multi_agent/different_spaces_for_agents.py +++ b/rllib/examples/multi_agent/different_spaces_for_agents.py @@ -33,8 +33,8 @@ class BasicMultiAgentMultiSpaces(MultiAgentEnv): """A simple multi-agent example environment where agents have different spaces. - agent0: obs=(10,), act=Discrete(2) - agent1: obs=(20,), act=Discrete(3) + agent0: obs=Box(10,), act=Discrete(2) + agent1: obs=Box(20,), act=Discrete(3) The logic of the env doesn't really matter for this example. The point of this env is to show how to use multi-agent envs, in which the different agents utilize diff --git a/rllib/examples/multi_agent/multi_agent_cartpole.py b/rllib/examples/multi_agent/multi_agent_cartpole.py index b26f9b6ecb1b..4bdf019f10b1 100644 --- a/rllib/examples/multi_agent/multi_agent_cartpole.py +++ b/rllib/examples/multi_agent/multi_agent_cartpole.py @@ -51,7 +51,7 @@ # TODO (sven): MAEnvRunner does not support vectorized envs yet # due to gym's env checkers and non-compatability with RLlib's # MultiAgentEnv API. 
- num_envs_per_worker=1 + num_envs_per_env_runner=1 if args.num_agents > 0 else 20, ) diff --git a/rllib/examples/multi_agent/multi_agent_pendulum.py b/rllib/examples/multi_agent/multi_agent_pendulum.py index 00e73bafd3c5..757bed5cb76e 100644 --- a/rllib/examples/multi_agent/multi_agent_pendulum.py +++ b/rllib/examples/multi_agent/multi_agent_pendulum.py @@ -47,7 +47,7 @@ get_trainable_cls(args.algo) .get_default_config() .environment("env" if args.num_agents > 0 else "Pendulum-v1") - .env_runners(num_rollout_workers=4) + .env_runners(num_env_runners=4) .training( train_batch_size_per_learner=512, mini_batch_size_per_learner=64, diff --git a/rllib/examples/multi_agent/self_play_league_based_with_open_spiel.py b/rllib/examples/multi_agent/self_play_league_based_with_open_spiel.py index 1792f280b12f..046613a49a27 100644 --- a/rllib/examples/multi_agent/self_play_league_based_with_open_spiel.py +++ b/rllib/examples/multi_agent/self_play_league_based_with_open_spiel.py @@ -36,7 +36,6 @@ import ray from ray.rllib.core.rl_module.marl_module import MultiAgentRLModuleSpec from ray.rllib.core.rl_module.rl_module import SingleAgentRLModuleSpec -from ray.rllib.env.multi_agent_env_runner import MultiAgentEnvRunner from ray.rllib.env.utils import try_import_pyspiel, try_import_open_spiel from ray.rllib.env.wrappers.open_spiel import OpenSpielEnv from ray.rllib.examples.multi_agent.utils import ( @@ -160,7 +159,10 @@ def _get_multi_agent(): get_trainable_cls(args.algo) .get_default_config() # Use new API stack ... - .experimental(_enable_new_api_stack=args.enable_new_api_stack) + .api_stack( + enable_rl_module_and_learner=args.enable_new_api_stack, + enable_env_runner_and_connector_v2=args.enable_new_api_stack, + ) .environment("open_spiel_env") .framework(args.framework) # Set up the main piece in this experiment: The league-bases self-play @@ -175,13 +177,8 @@ def _get_multi_agent(): ) ) .env_runners( - num_rollout_workers=args.num_env_runners, - num_envs_per_worker=1 if args.enable_new_api_stack else 5, - # Set up the correct env-runner to use depending on - # old-stack/new-stack and multi-agent settings. 
- env_runner_cls=( - None if not args.enable_new_api_stack else MultiAgentEnvRunner - ), + num_env_runners=args.num_env_runners, + num_envs_per_env_runner=1 if args.enable_new_api_stack else 5, ) .resources( num_learner_workers=args.num_gpus, @@ -255,7 +252,7 @@ def _get_multi_agent(): action = ask_user_for_action(time_step) else: obs = np.array(time_step.observations["info_state"][player_id]) - if config.uses_new_env_runners: + if config.enable_env_runner_and_connector_v2: action = algo.workers.local_worker().module.forward_inference( {"obs": obs} ) diff --git a/rllib/examples/multi_agent/self_play_with_open_spiel.py b/rllib/examples/multi_agent/self_play_with_open_spiel.py index fbca0b75ac9d..c6cccbbb2c28 100644 --- a/rllib/examples/multi_agent/self_play_with_open_spiel.py +++ b/rllib/examples/multi_agent/self_play_with_open_spiel.py @@ -24,7 +24,6 @@ from ray.rllib.core.rl_module.rl_module import SingleAgentRLModuleSpec from ray.rllib.core.rl_module.marl_module import MultiAgentRLModuleSpec -from ray.rllib.env.multi_agent_env_runner import MultiAgentEnvRunner from ray.rllib.env.utils import try_import_pyspiel, try_import_open_spiel from ray.rllib.env.wrappers.open_spiel import OpenSpielEnv from ray.rllib.examples.rl_modules.classes.random_rlm import RandomRLModule @@ -106,7 +105,10 @@ def policy_mapping_fn(agent_id, episode, worker, **kwargs): config = ( get_trainable_cls(args.algo) .get_default_config() - .experimental(_enable_new_api_stack=args.enable_new_api_stack) + .api_stack( + enable_rl_module_and_learner=args.enable_new_api_stack, + enable_env_runner_and_connector_v2=args.enable_new_api_stack, + ) .environment("open_spiel_env") .framework(args.framework) # Set up the main piece in this experiment: The league-bases self-play @@ -123,13 +125,8 @@ def policy_mapping_fn(agent_id, episode, worker, **kwargs): ) ) .env_runners( - num_rollout_workers=args.num_env_runners, - num_envs_per_worker=1 if args.enable_new_api_stack else 5, - # Set up the correct env-runner to use depending on - # old-stack/new-stack and multi-agent settings. 
- env_runner_cls=( - None if not args.enable_new_api_stack else MultiAgentEnvRunner - ), + num_env_runners=args.num_env_runners, + num_envs_per_env_runner=1 if args.enable_new_api_stack else 5, ) .resources( num_learner_workers=args.num_gpus, diff --git a/rllib/examples/multi_agent/two_algorithms.py b/rllib/examples/multi_agent/two_algorithms.py index 43e75b4b414f..8cffdaf4173f 100644 --- a/rllib/examples/multi_agent/two_algorithms.py +++ b/rllib/examples/multi_agent/two_algorithms.py @@ -82,7 +82,7 @@ def select_policy(algorithm, framework): # Construct two independent Algorithm configs ppo_config = ( PPOConfig() - .experimental(_enable_new_api_stack=False) + .api_stack(enable_rl_module_and_learner=False) .environment("multi_agent_cartpole") .framework(args.framework) # disable filters, otherwise we would need to synchronize those diff --git a/rllib/examples/offline_rl/offline_rl.py b/rllib/examples/offline_rl/offline_rl.py index 4d4f0803cf45..5ad0bef527d3 100644 --- a/rllib/examples/offline_rl/offline_rl.py +++ b/rllib/examples/offline_rl/offline_rl.py @@ -53,7 +53,7 @@ config = ( cql.CQLConfig() .framework(framework="torch") - .env_runners(num_rollout_workers=0) + .env_runners(num_env_runners=0) .training( n_step=3, bc_iters=0, diff --git a/rllib/examples/ray_serve/ray_serve_with_rllib.py b/rllib/examples/ray_serve/ray_serve_with_rllib.py index be1432acadf1..6001865a5544 100644 --- a/rllib/examples/ray_serve/ray_serve_with_rllib.py +++ b/rllib/examples/ray_serve/ray_serve_with_rllib.py @@ -68,7 +68,9 @@ def kill_proc(proc): # Config for the served RLlib RLModule/Algorithm. config = ( - PPOConfig().experimental(_enable_new_api_stack=True).environment("CartPole-v1") + PPOConfig() + .api_stack(enable_rl_module_and_learner=True) + .environment("CartPole-v1") ) # Train the Algorithm for some time, then save it and get the checkpoint path. diff --git a/rllib/examples/ray_tune/custom_experiment.py b/rllib/examples/ray_tune/custom_experiment.py index c905c6fdce27..dbb393d290dc 100644 --- a/rllib/examples/ray_tune/custom_experiment.py +++ b/rllib/examples/ray_tune/custom_experiment.py @@ -43,7 +43,6 @@ import numpy as np from ray import train, tune from ray.rllib.algorithms.ppo import PPOConfig -from ray.rllib.env.single_agent_env_runner import SingleAgentEnvRunner from ray.rllib.utils.framework import try_import_torch torch, _ = try_import_torch() @@ -58,7 +57,7 @@ def my_experiment(config: Dict): config = ( PPOConfig() .update_from_dict(config) - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .environment("CartPole-v1") ) @@ -94,7 +93,7 @@ def my_experiment(config: Dict): # Set the number of EnvRunners for collecting training data to 0 (local # worker only). - config.env_runners(num_rollout_workers=0) + config.env_runners(num_env_runners=0) eval_algo = config.build() # Load state from the low-lr algo into this one. @@ -155,12 +154,12 @@ def my_experiment(config: Dict): if __name__ == "__main__": base_config = ( PPOConfig() - .experimental(_enable_new_api_stack=True) - .environment("CartPole-v1") - .env_runners( - num_rollout_workers=0, - env_runner_cls=SingleAgentEnvRunner, + .api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, ) + .environment("CartPole-v1") + .env_runners(num_env_runners=0) ) # Convert to a plain dict for Tune. Note that this is usually not needed, you can # pass into the below Tune Tuner any instantiated RLlib AlgorithmConfig object. 
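Note (illustration only, not part of the diff): a minimal sketch of the config pattern that `custom_experiment.py` is migrated to above. Both new API stack flags are set via `.api_stack()`, rollout settings via `.env_runners()`, and the `AlgorithmConfig` object is handed to Tune directly, as the preceding comment describes. The PPO/CartPole setup and the stop criterion are assumed example values, not code from this PR.

from ray import air, tune
from ray.rllib.algorithms.ppo import PPOConfig

# Both new-API-stack switches on: RLModule/Learner plus EnvRunner/ConnectorV2.
config = (
    PPOConfig()
    .api_stack(
        enable_rl_module_and_learner=True,
        enable_env_runner_and_connector_v2=True,
    )
    .environment("CartPole-v1")
    .env_runners(num_env_runners=0)
)

# An instantiated AlgorithmConfig can be passed to Tune as-is (no `.to_dict()` needed).
# The stop criterion below is an arbitrary example value.
tuner = tune.Tuner(
    "PPO",
    param_space=config,
    run_config=air.RunConfig(stop={"training_iteration": 1}),
)
tuner.fit()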
diff --git a/rllib/examples/ray_tune/custom_logger.py b/rllib/examples/ray_tune/custom_logger.py index 88f8504f19cd..9ec3af15a2ae 100644 --- a/rllib/examples/ray_tune/custom_logger.py +++ b/rllib/examples/ray_tune/custom_logger.py @@ -52,7 +52,6 @@ from ray import air, tune from ray.rllib.algorithms.ppo import PPOConfig -from ray.rllib.env.single_agent_env_runner import SingleAgentEnvRunner from ray.tune.logger import Logger, LegacyLoggerCallback @@ -83,8 +82,10 @@ def flush(self): if __name__ == "__main__": config = ( PPOConfig() - .experimental(_enable_new_api_stack=True) - .env_runners(env_runner_cls=SingleAgentEnvRunner) + .api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, + ) .environment("CartPole-v1") # Setting up a custom logger config. # ---------------------------------- diff --git a/rllib/examples/ray_tune/custom_progress_reporter.py b/rllib/examples/ray_tune/custom_progress_reporter.py index f0c4c76f14f1..57bb64dff8f6 100644 --- a/rllib/examples/ray_tune/custom_progress_reporter.py +++ b/rllib/examples/ray_tune/custom_progress_reporter.py @@ -45,7 +45,6 @@ """ from ray import air, tune from ray.rllib.algorithms.ppo import PPOConfig -from ray.rllib.env.multi_agent_env_runner import MultiAgentEnvRunner from ray.rllib.examples.envs.classes.multi_agent import MultiAgentCartPole @@ -89,8 +88,10 @@ config = ( PPOConfig() - .experimental(_enable_new_api_stack=True) - .env_runners(env_runner_cls=MultiAgentEnvRunner) + .api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, + ) .environment("env") .multi_agent( # Define 3 policies. Note that in our simple setup, they are all configured diff --git a/rllib/examples/rl_modules/classes/mobilenet_rlm.py b/rllib/examples/rl_modules/classes/mobilenet_rlm.py index b3827fd010bc..f31ae4f1c6d4 100644 --- a/rllib/examples/rl_modules/classes/mobilenet_rlm.py +++ b/rllib/examples/rl_modules/classes/mobilenet_rlm.py @@ -56,7 +56,7 @@ def setup(self): config = ( PPOConfig() - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .rl_module( rl_module_spec=SingleAgentRLModuleSpec(module_class=MobileNetTorchPPORLModule) ) @@ -73,7 +73,7 @@ def setup(self): ), }, ) - .env_runners(num_rollout_workers=0) + .env_runners(num_env_runners=0) # The following training settings make it so that a training iteration is very # quick. This is just for the sake of this example. PPO will not learn properly # with these settings! diff --git a/rllib/models/tests/test_preprocessors.py b/rllib/models/tests/test_preprocessors.py index cfe8cec79155..05d736945ed7 100644 --- a/rllib/models/tests/test_preprocessors.py +++ b/rllib/models/tests/test_preprocessors.py @@ -39,7 +39,7 @@ def tearDownClass(cls) -> None: def test_rlms_and_preprocessing(self): config = ( ppo.PPOConfig() - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .framework("tf2") .environment( env="ray.rllib.examples.envs.classes.random_env.RandomEnv", diff --git a/rllib/policy/eager_tf_policy.py b/rllib/policy/eager_tf_policy.py index 94f716530354..e3d7f93bf67c 100644 --- a/rllib/policy/eager_tf_policy.py +++ b/rllib/policy/eager_tf_policy.py @@ -180,7 +180,7 @@ def compute_actions_from_input_dict( # Create a traced version of `self._compute_actions_helper`. 
if self._traced_compute_actions_helper is False and not self._no_tracing: - if self.config.get("_enable_new_api_stack"): + if self.config.get("enable_rl_module_and_learner"): self._compute_actions_helper_rl_module_explore = ( _convert_eager_inputs( tf.function( @@ -442,7 +442,7 @@ def __init__(self, observation_space, action_space, config): # action). self._lock = threading.RLock() - if self.config.get("_enable_new_api_stack", False): + if self.config.get("enable_rl_module_and_learner", False): # Maybe update view_requirements, e.g. for recurrent case. self.view_requirements = self.model.update_default_view_requirements( self.view_requirements @@ -754,7 +754,10 @@ def get_state(self) -> PolicyState: if self._optimizer and len(self._optimizer.variables()) > 0: state["_optimizer_variables"] = self._optimizer.variables() # Add exploration state. - if not self.config.get("_enable_new_api_stack", False) and self.exploration: + if ( + not self.config.get("enable_rl_module_and_learner", False) + and self.exploration + ): # This is not compatible with RLModules, which have a method # `forward_exploration` to specify custom exploration behavior. state["_exploration_state"] = self.exploration.get_state() diff --git a/rllib/policy/eager_tf_policy_v2.py b/rllib/policy/eager_tf_policy_v2.py index cc978c5ac677..1a14520f1d11 100644 --- a/rllib/policy/eager_tf_policy_v2.py +++ b/rllib/policy/eager_tf_policy_v2.py @@ -112,7 +112,7 @@ def __init__( # If using default make_model(), dist_class will get updated when # the model is created next. - if self.config.get("_enable_new_api_stack", False): + if self.config.get("enable_rl_module_and_learner", False): self.model = self.make_rl_module() self.dist_class = None else: @@ -121,7 +121,7 @@ def __init__( self._init_view_requirements() - if self.config.get("_enable_new_api_stack", False): + if self.config.get("enable_rl_module_and_learner", False): self.exploration = None else: self.exploration = self._create_exploration() @@ -158,10 +158,13 @@ def enable_eager_execution_if_necessary(): @override(Policy) def maybe_remove_time_dimension(self, input_dict: Dict[str, TensorType]): assert self.config.get( - "_enable_new_api_stack", False + "enable_rl_module_and_learner", False ), "This is a helper method for the new learner API." - if self.config.get("_enable_new_api_stack", False) and self.model.is_stateful(): + if ( + self.config.get("enable_rl_module_and_learner", False) + and self.model.is_stateful() + ): # Note that this is a temporary workaround to fit the old sampling stack # to RL Modules. ret = {} @@ -213,15 +216,15 @@ def loss( Returns: A single loss tensor or a list of loss tensors. """ - # Under the new _enable_new_api_stack the loss function still gets called in - # order to initialize the view requirements of the sample batches that are + # Under the new enable_rl_module_and_learner the loss function still gets called + # in order to initialize the view requirements of the sample batches that are # returned by the sampler. In this case, we don't actually want to compute any # loss, however # if we access the keys that are needed for a forward_train pass, then the # sampler will include those keys in the sample batches it returns. This means # that the correct sample batch keys will be available when using the learner # group API. 
- if self.config.get("_enable_new_api_stack", False): + if self.config.get("enable_rl_module_and_learner", False): for k in model.input_specs_train(): train_batch[k] return None @@ -442,7 +445,7 @@ def _init_dist_class(self): return dist_class def _init_view_requirements(self): - if self.config.get("_enable_new_api_stack", False): + if self.config.get("enable_rl_module_and_learner", False): # Maybe update view_requirements, e.g. for recurrent case. self.view_requirements = self.model.update_default_view_requirements( self.view_requirements @@ -458,7 +461,7 @@ def _init_view_requirements(self): self.view_requirements[SampleBatch.INFOS].used_for_training = False def maybe_initialize_optimizer_and_loss(self): - if not self.config.get("_enable_new_api_stack", False): + if not self.config.get("enable_rl_module_and_learner", False): optimizers = force_list(self.optimizer()) if self.exploration: # Policies with RLModules don't have an exploration object. @@ -509,7 +512,7 @@ def compute_actions_from_input_dict( timestep=timestep, explore=explore, tf_sess=self.get_session() ) - if self.config.get("_enable_new_api_stack"): + if self.config.get("enable_rl_module_and_learner"): # For recurrent models, we need to add a time dimension. seq_lens = input_dict.get("seq_lens", None) if seq_lens is None: @@ -627,7 +630,7 @@ def compute_log_likelihoods( action_dist = self.dist_class(dist_inputs, self.model) # Default log-likelihood calculation. else: - if self.config.get("_enable_new_api_stack", False): + if self.config.get("enable_rl_module_and_learner", False): if in_training: output = self.model.forward_train(input_batch) action_dist_cls = self.model.get_train_action_dist_cls() @@ -783,7 +786,7 @@ def get_state(self) -> PolicyState: state["global_timestep"] = state["global_timestep"].numpy() # In the new Learner API stack, the optimizers live in the learner. state["_optimizer_variables"] = [] - if not self.config.get("_enable_new_api_stack", False): + if not self.config.get("enable_rl_module_and_learner", False): if self._optimizer and len(self._optimizer.variables()) > 0: state["_optimizer_variables"] = self._optimizer.variables() diff --git a/rllib/policy/policy.py b/rllib/policy/policy.py index c0e78b5a10c8..4b348771ec8b 100644 --- a/rllib/policy/policy.py +++ b/rllib/policy/policy.py @@ -401,7 +401,7 @@ def make_rl_module(self) -> "RLModule": """Returns the RL Module (only for when RLModule API is enabled.) If RLModule API is enabled - (self.config.experimental(_enable_new_api_stack=True), this method should be + (self.config.api_stack(enable_rl_module_and_learner=True), this method should be implemented and should return the RLModule instance to use for this Policy. Otherwise, RLlib will error out. """ @@ -521,7 +521,7 @@ def compute_single_action( if input_dict is None: input_dict = {SampleBatch.OBS: obs} if state is not None: - if self.config.get("_enable_new_api_stack", False): + if self.config.get("enable_rl_module_and_learner", False): input_dict["state_in"] = state else: for i, s in enumerate(state): @@ -1255,7 +1255,7 @@ def _get_num_gpus_for_policy(self) -> int: # If we are on the new RLModule/Learner stack, `num_gpus` is deprecated. # so use `num_gpus_per_worker` for policy sampling # we need this .get() syntax here to ensure backwards compatibility. - if self.config.get("_enable_new_api_stack", False): + if self.config.get("enable_rl_module_and_learner", False): num_gpus = self.config["num_gpus_per_worker"] else: # If head node, take num_gpus. 
@@ -1391,12 +1391,12 @@ def _initialize_loss_from_dummy_batch( self._lazy_tensor_dict(self._dummy_batch) # With RL Modules you want the explore flag to be True for initialization # of the tensors and placeholder you'd need for training. - explore = self.config.get("_enable_new_api_stack", False) + explore = self.config.get("enable_rl_module_and_learner", False) actions, state_outs, extra_outs = self.compute_actions_from_input_dict( self._dummy_batch, explore=explore ) - if not self.config.get("_enable_new_api_stack", False): + if not self.config.get("enable_rl_module_and_learner", False): for key, view_req in self.view_requirements.items(): if key not in self._dummy_batch.accessed_keys: view_req.used_for_compute_actions = False @@ -1446,7 +1446,7 @@ def _initialize_loss_from_dummy_batch( seq_lens = None if state_outs: B = 4 # For RNNs, have B=4, T=[depends on sample_batch_size] - if self.config.get("_enable_new_api_stack", False): + if self.config.get("enable_rl_module_and_learner", False): sub_batch = postprocessed_batch[:B] postprocessed_batch["state_in"] = sub_batch["state_in"] postprocessed_batch["state_out"] = sub_batch["state_out"] @@ -1466,7 +1466,7 @@ def _initialize_loss_from_dummy_batch( seq_lens = np.array([seq_len for _ in range(B)], dtype=np.int32) postprocessed_batch[SampleBatch.SEQ_LENS] = seq_lens - if not self.config.get("_enable_new_api_stack"): + if not self.config.get("enable_rl_module_and_learner"): # Switch on lazy to-tensor conversion on `postprocessed_batch`. train_batch = self._lazy_tensor_dict(postprocessed_batch) # Calling loss, so set `is_training` to True. @@ -1506,7 +1506,7 @@ def _initialize_loss_from_dummy_batch( # Add new columns automatically to view-reqs. if ( - not self.config.get("_enable_new_api_stack") + not self.config.get("enable_rl_module_and_learner") and auto_remove_unneeded_view_reqs ): # Add those needed for postprocessing and training. @@ -1607,7 +1607,7 @@ def maybe_add_time_dimension( # We need to check for hasattr(self, "model") because a dummy Policy may not # have a model. if ( - self.config.get("_enable_new_api_stack", False) + self.config.get("enable_rl_module_and_learner", False) and hasattr(self, "model") and self.model.is_stateful() ): diff --git a/rllib/policy/tests/test_compute_log_likelihoods.py b/rllib/policy/tests/test_compute_log_likelihoods.py index 5277bc5c87b0..c13d0bbfd561 100644 --- a/rllib/policy/tests/test_compute_log_likelihoods.py +++ b/rllib/policy/tests/test_compute_log_likelihoods.py @@ -124,7 +124,7 @@ def do_test_log_likelihood( # The expected logp computation logic is overfitted to the ModelV2 # stack and does not generalize to RLModule API. 
- if not config._enable_new_api_stack: + if not config.enable_rl_module_and_learner: expected_logp = _get_expected_logp( fw, vars, obs_batch, a, layer_key, logp_func ) @@ -142,7 +142,7 @@ def do_test_log_likelihood( in_training=False, ) - if not config._enable_new_api_stack: + if not config.enable_rl_module_and_learner: check(np.exp(logp), expected_prob, atol=0.2) diff --git a/rllib/policy/tests/test_export_checkpoint_and_model.py b/rllib/policy/tests/test_export_checkpoint_and_model.py index 3a54e60d605e..32eaa654e00f 100644 --- a/rllib/policy/tests/test_export_checkpoint_and_model.py +++ b/rllib/policy/tests/test_export_checkpoint_and_model.py @@ -30,7 +30,7 @@ def export_test( cls = get_trainable_cls(alg_name) config = cls.get_default_config() if alg_name in RLMODULE_SUPPORTED_ALGOS: - config = config.experimental(_enable_new_api_stack=False) + config = config.api_stack(enable_rl_module_and_learner=False) config.framework(framework) # Switch on saving native DL-framework (tf, torch) model files. config.checkpointing(export_native_model_files=True) diff --git a/rllib/policy/tests/test_policy.py b/rllib/policy/tests/test_policy.py index dfd182dfe37f..6bd09c6e8ff3 100644 --- a/rllib/policy/tests/test_policy.py +++ b/rllib/policy/tests/test_policy.py @@ -34,7 +34,7 @@ def test_policy_get_and_set_state(self): # Make sure everything is the same. # This is only supported without RLModule API. See AlgorithmConfig for # more info. - if not config._enable_new_api_stack: + if not config.enable_rl_module_and_learner: check(state1["_exploration_state"], state3["_exploration_state"]) check(state1["global_timestep"], state3["global_timestep"]) check(state1["weights"], state3["weights"]) @@ -47,7 +47,7 @@ def test_policy_get_and_set_state(self): state4 = policy_restored_from_scratch.get_state() # This is only supported without RLModule API. See AlgorithmConfig for # more info. - if not config._enable_new_api_stack: + if not config.enable_rl_module_and_learner: check(state3["_exploration_state"], state4["_exploration_state"]) check(state3["global_timestep"], state4["global_timestep"]) # For tf static graph, the new model has different layer names diff --git a/rllib/policy/tf_mixins.py b/rllib/policy/tf_mixins.py index 9a1869a4b768..e4e88aa00785 100644 --- a/rllib/policy/tf_mixins.py +++ b/rllib/policy/tf_mixins.py @@ -34,7 +34,9 @@ def __init__(self, lr, lr_schedule): self._lr_schedule = None # Disable any scheduling behavior related to learning if Learner API is active. # Schedules are handled by Learner class. - if lr_schedule is None or self.config.get("_enable_new_api_stack", False): + if lr_schedule is None or self.config.get( + "enable_rl_module_and_learner", False + ): self.cur_lr = tf1.get_variable("lr", initializer=lr, trainable=False) else: self._lr_schedule = PiecewiseSchedule( @@ -81,7 +83,7 @@ def __init__(self, entropy_coeff, entropy_coeff_schedule): # Disable any scheduling behavior related to learning if Learner API is active. # Schedules are handled by Learner class. 
if entropy_coeff_schedule is None or ( - self.config.get("_enable_new_api_stack", False) + self.config.get("enable_rl_module_and_learner", False) ): self.entropy_coeff = get_variable( entropy_coeff, framework="tf", tf_name="entropy_coeff", trainable=False @@ -214,7 +216,7 @@ class TargetNetworkMixin: """ def __init__(self): - if not self.config.get("_enable_new_api_stack", False): + if not self.config.get("enable_rl_module_and_learner", False): model_vars = self.model.trainable_variables() target_model_vars = self.target_model.trainable_variables() @@ -244,7 +246,7 @@ def update_target_fn(tau): @property def q_func_vars(self): if not hasattr(self, "_q_func_vars"): - if self.config.get("_enable_new_api_stack", False): + if self.config.get("enable_rl_module_and_learner", False): self._q_func_vars = self.model.variables else: self._q_func_vars = self.model.variables() @@ -253,7 +255,7 @@ def q_func_vars(self): @property def target_q_func_vars(self): if not hasattr(self, "_target_q_func_vars"): - if self.config.get("_enable_new_api_stack", False): + if self.config.get("enable_rl_module_and_learner", False): self._target_q_func_vars = self.target_model.variables else: self._target_q_func_vars = self.target_model.variables() @@ -265,7 +267,7 @@ def update_target(self, tau: int = None) -> None: @override(TFPolicy) def variables(self) -> List[TensorType]: - if self.config.get("_enable_new_api_stack", False): + if self.config.get("enable_rl_module_and_learner", False): return self.model.variables else: return self.model.variables() @@ -277,7 +279,7 @@ def set_weights(self, weights): EagerTFPolicyV2.set_weights(self, weights) elif isinstance(self, EagerTFPolicy): # Handle TF2 policies. EagerTFPolicy.set_weights(self, weights) - if not self.config.get("_enable_new_api_stack", False): + if not self.config.get("enable_rl_module_and_learner", False): self.update_target(self.config.get("tau", 1.0)) diff --git a/rllib/policy/torch_mixins.py b/rllib/policy/torch_mixins.py index 7a11ec13a408..c2343f8c315c 100644 --- a/rllib/policy/torch_mixins.py +++ b/rllib/policy/torch_mixins.py @@ -35,7 +35,7 @@ def __init__(self, lr, lr_schedule, lr2=None, lr2_schedule=None): @override(Policy) def on_global_var_update(self, global_vars): super().on_global_var_update(global_vars) - if not self.config.get("_enable_new_api_stack", False): + if not self.config.get("enable_rl_module_and_learner", False): if self._lr_schedule: self.cur_lr = self._lr_schedule.value(global_vars["timestep"]) for opt in self._optimizers: @@ -58,7 +58,7 @@ def __init__(self, entropy_coeff, entropy_coeff_schedule): # Disable any scheduling behavior related to learning if Learner API is active. # Schedules are handled by Learner class. if entropy_coeff_schedule is None or ( - self.config.get("_enable_new_api_stack", False) + self.config.get("enable_rl_module_and_learner", False) ): self.entropy_coeff = entropy_coeff else: @@ -210,7 +210,7 @@ def update_target(self, tau=None): # Support partial (soft) synching. # If tau == 1.0: Full sync from Q-model to target Q-model. 
- if self.config.get("_enable_new_api_stack", False): + if self.config.get("enable_rl_module_and_learner", False): target_current_network_pairs = self.model.get_target_network_pairs() for target_network, current_network in target_current_network_pairs: current_state_dict = current_network.state_dict() diff --git a/rllib/policy/torch_policy.py b/rllib/policy/torch_policy.py index 31bce76119ab..6d53b78da360 100644 --- a/rllib/policy/torch_policy.py +++ b/rllib/policy/torch_policy.py @@ -739,7 +739,10 @@ def get_state(self) -> PolicyState: optim_state_dict = convert_to_numpy(o.state_dict()) state["_optimizer_variables"].append(optim_state_dict) # Add exploration state. - if not self.config.get("_enable_new_api_stack", False) and self.exploration: + if ( + not self.config.get("enable_rl_module_and_learner", False) + and self.exploration + ): # This is not compatible with RLModules, which have a method # `forward_exploration` to specify custom exploration behavior. state["_exploration_state"] = self.exploration.get_state() diff --git a/rllib/policy/torch_policy_v2.py b/rllib/policy/torch_policy_v2.py index c62d7e151965..5a52cdfd32bc 100644 --- a/rllib/policy/torch_policy_v2.py +++ b/rllib/policy/torch_policy_v2.py @@ -86,7 +86,7 @@ def __init__( super().__init__(observation_space, action_space, config) # Create model. - if self.config.get("_enable_new_api_stack", False): + if self.config.get("enable_rl_module_and_learner", False): model = self.make_rl_module() dist_class = None @@ -173,7 +173,7 @@ def __init__( self._state_inputs = self.model.get_initial_state() self._is_recurrent = len(tree.flatten(self._state_inputs)) > 0 - if self.config.get("_enable_new_api_stack", False): + if self.config.get("enable_rl_module_and_learner", False): # Maybe update view_requirements, e.g. for recurrent case. self.view_requirements = self.model.update_default_view_requirements( self.view_requirements @@ -184,13 +184,13 @@ def __init__( # Combine view_requirements for Model and Policy. self.view_requirements.update(self.model.view_requirements) - if self.config.get("_enable_new_api_stack", False): + if self.config.get("enable_rl_module_and_learner", False): # We don't need an exploration object with RLModules self.exploration = None else: self.exploration = self._create_exploration() - if not self.config.get("_enable_new_api_stack", False): + if not self.config.get("enable_rl_module_and_learner", False): self._optimizers = force_list(self.optimizer()) # Backward compatibility workaround so Policy will call self.loss() @@ -250,15 +250,15 @@ def loss( Returns: Loss tensor given the input batch. """ - # Under the new _enable_new_api_stack the loss function still gets called in - # order to initialize the view requirements of the sample batches that are + # Under the new enable_rl_module_and_learner the loss function still gets called + # in order to initialize the view requirements of the sample batches that are # returned by # the sampler. In this case, we don't actually want to compute any loss, however # if we access the keys that are needed for a forward_train pass, then the # sampler will include those keys in the sample batches it returns. This means # that the correct sample batch keys will be available when using the learner # group API. 
- if self.config._enable_new_api_stack: + if self.config.enable_rl_module_and_learner: for k in model.input_specs_train(): train_batch[k] return None @@ -327,10 +327,13 @@ def make_model(self) -> ModelV2: @override(Policy) def maybe_remove_time_dimension(self, input_dict: Dict[str, TensorType]): assert self.config.get( - "_enable_new_api_stack", False + "enable_rl_module_and_learner", False ), "This is a helper method for the new learner API." - if self.config.get("_enable_new_api_stack", False) and self.model.is_stateful(): + if ( + self.config.get("enable_rl_module_and_learner", False) + and self.model.is_stateful() + ): # Note that this is a temporary workaround to fit the old sampling stack # to RL Modules. ret = {} @@ -533,7 +536,7 @@ def compute_actions_from_input_dict( # Pass lazy (torch) tensor dict to Model as `input_dict`. input_dict = self._lazy_tensor_dict(input_dict) input_dict.set_training(True) - if self.config.get("_enable_new_api_stack", False): + if self.config.get("enable_rl_module_and_learner", False): return self._compute_action_helper( input_dict, state_batches=None, @@ -647,7 +650,7 @@ def compute_log_likelihoods( action_dist = dist_class(dist_inputs, self.model) # Default action-dist inputs calculation. else: - if self.config.get("_enable_new_api_stack", False): + if self.config.get("enable_rl_module_and_learner", False): if in_training: output = self.model.forward_train(input_dict) action_dist_cls = self.model.get_train_action_dist_cls() @@ -754,9 +757,11 @@ def load_batch_into_buffer( shuffle=False, batch_divisibility_req=self.batch_divisibility_req, view_requirements=self.view_requirements, - _enable_new_api_stack=self.config.get("_enable_new_api_stack", False), + _enable_new_api_stack=self.config.get( + "enable_rl_module_and_learner", False + ), padding="last" - if self.config.get("_enable_new_api_stack", False) + if self.config.get("enable_rl_module_and_learner", False) else "zero", ) self._lazy_tensor_dict(batch) @@ -781,9 +786,11 @@ def load_batch_into_buffer( shuffle=False, batch_divisibility_req=self.batch_divisibility_req, view_requirements=self.view_requirements, - _enable_new_api_stack=self.config.get("_enable_new_api_stack", False), + _enable_new_api_stack=self.config.get( + "enable_rl_module_and_learner", False + ), padding="last" - if self.config.get("_enable_new_api_stack", False) + if self.config.get("enable_rl_module_and_learner", False) else "zero", ) @@ -883,7 +890,7 @@ def learn_on_loaded_batch(self, offset: int = 0, buffer_index: int = 0): { LEARNER_STATS_KEY: self.stats_fn(batch), "model": {} - if self.config.get("_enable_new_api_stack", False) + if self.config.get("enable_rl_module_and_learner", False) else model.metrics(), NUM_GRAD_UPDATES_LIFETIME: self.num_grad_updates, # -1, b/c we have to measure this diff before we do the update @@ -911,9 +918,11 @@ def compute_gradients(self, postprocessed_batch: SampleBatch) -> ModelGradients: shuffle=False, batch_divisibility_req=self.batch_divisibility_req, view_requirements=self.view_requirements, - _enable_new_api_stack=self.config.get("_enable_new_api_stack", False), + _enable_new_api_stack=self.config.get( + "enable_rl_module_and_learner", False + ), padding="last" - if self.config.get("_enable_new_api_stack", False) + if self.config.get("enable_rl_module_and_learner", False) else "zero", ) @@ -992,7 +1001,7 @@ def get_weights(self) -> ModelWeights: @override(Policy) def set_weights(self, weights: ModelWeights) -> None: weights = convert_to_torch_tensor(weights, device=self.device) - if 
self.config.get("_enable_new_api_stack", False): + if self.config.get("enable_rl_module_and_learner", False): self.model.set_state(weights) else: self.model.load_state_dict(weights) @@ -1007,7 +1016,7 @@ def num_state_tensors(self) -> int: @override(Policy) def get_initial_state(self) -> List[TensorType]: - if self.config.get("_enable_new_api_stack", False): + if self.config.get("enable_rl_module_and_learner", False): # convert the tree of tensors to a tree to numpy arrays return tree.map_structure( lambda s: convert_to_numpy(s), self.model.get_initial_state() @@ -1023,12 +1032,15 @@ def get_state(self) -> PolicyState: state["_optimizer_variables"] = [] # In the new Learner API stack, the optimizers live in the learner. - if not self.config.get("_enable_new_api_stack", False): + if not self.config.get("enable_rl_module_and_learner", False): for i, o in enumerate(self._optimizers): optim_state_dict = convert_to_numpy(o.state_dict()) state["_optimizer_variables"].append(optim_state_dict) # Add exploration state. - if not self.config.get("_enable_new_api_stack", False) and self.exploration: + if ( + not self.config.get("enable_rl_module_and_learner", False) + and self.exploration + ): # This is not compatible with RLModules, which have a method # `forward_exploration` to specify custom exploration behavior. state["_exploration_state"] = self.exploration.get_state() @@ -1074,7 +1086,7 @@ def export_model(self, export_dir: str, onnx: Optional[int] = None) -> None: os.makedirs(export_dir, exist_ok=True) - enable_rl_module = self.config.get("_enable_new_api_stack", False) + enable_rl_module = self.config.get("enable_rl_module_and_learner", False) if enable_rl_module and onnx: raise ValueError("ONNX export not supported for RLModule API.") diff --git a/rllib/tests/backward_compat/test_backward_compat.py b/rllib/tests/backward_compat/test_backward_compat.py index c58ea206bf0c..5386aaf925b2 100644 --- a/rllib/tests/backward_compat/test_backward_compat.py +++ b/rllib/tests/backward_compat/test_backward_compat.py @@ -92,6 +92,9 @@ def test_old_algorithm_config_dicts(self): "policies_to_train": ["pol1"], "policy_mapping_fn": lambda aid, episode, worker, **kwargs: "pol1", }, + # Test, whether both keys (that map to the same new key) still work. 
+ "num_workers": 2, + "num_rollout_workers": 2, } config = AlgorithmConfig.from_dict(config_dict) self.assertFalse(config.in_evaluation) @@ -101,6 +104,7 @@ def test_old_algorithm_config_dicts(self): eval_config = config.get_evaluation_config_object() self.assertTrue(eval_config.in_evaluation) self.assertTrue(eval_config.lr == 0.1) + self.assertTrue(config.num_env_runners == 2) register_env( "test", @@ -114,7 +118,7 @@ def test_old_algorithm_config_dicts(self): }, "lr": 0.001, "evaluation_config": { - "num_envs_per_env_runner": 4, + "num_envs_per_worker": 4, # old key -> num_envs_per_env_runner "explore": False, }, "evaluation_num_env_runners": 1, diff --git a/rllib/tests/test_algorithm_rl_module_restore.py b/rllib/tests/test_algorithm_rl_module_restore.py index 0f1c2f616210..f5028f8826b5 100644 --- a/rllib/tests/test_algorithm_rl_module_restore.py +++ b/rllib/tests/test_algorithm_rl_module_restore.py @@ -50,7 +50,7 @@ def policy_mapping_fn(agent_id, episode, worker, **kwargs): config = ( PPOConfig() - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .env_runners(rollout_fragment_length=4) .environment(MultiAgentCartPole, env_config={"num_agents": num_agents}) .training(num_sgd_iter=1, train_batch_size=8, sgd_minibatch_size=8) @@ -89,7 +89,7 @@ def test_e2e_load_simple_marl_module(self): module_specs=module_specs, load_state_path=marl_checkpoint_path, ) - config = config.experimental(_enable_new_api_stack=True).rl_module( + config = config.api_stack(enable_rl_module_and_learner=True).rl_module( rl_module_spec=marl_module_spec_from_checkpoint, ) @@ -155,7 +155,7 @@ def test_e2e_load_complex_marl_module(self): module_specs=module_specs, load_state_path=marl_checkpoint_path, ) - config = config.experimental(_enable_new_api_stack=True).rl_module( + config = config.api_stack(enable_rl_module_and_learner=True).rl_module( rl_module_spec=marl_module_spec_from_checkpoint, ) @@ -188,7 +188,7 @@ def test_e2e_load_rl_module(self): config = ( PPOConfig() - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .env_runners(rollout_fragment_length=4) .environment("CartPole-v1") .training(num_sgd_iter=1, train_batch_size=8, sgd_minibatch_size=8) @@ -221,7 +221,7 @@ def test_e2e_load_rl_module(self): load_state_path=module_ckpt_path, ) - config = config.experimental(_enable_new_api_stack=True).rl_module( + config = config.api_stack(enable_rl_module_and_learner=True).rl_module( rl_module_spec=module_to_load_spec, ) @@ -300,7 +300,7 @@ def test_e2e_load_complex_marl_module_with_modules_to_load(self): "policy_0", }, ) - config = config.experimental(_enable_new_api_stack=True).rl_module( + config = config.api_stack(enable_rl_module_and_learner=True).rl_module( rl_module_spec=marl_module_spec_from_checkpoint, ) diff --git a/rllib/tests/test_algorithm_save_load_checkpoint_learner.py b/rllib/tests/test_algorithm_save_load_checkpoint_learner.py index 02467b60858d..1e32be167df0 100644 --- a/rllib/tests/test_algorithm_save_load_checkpoint_learner.py +++ b/rllib/tests/test_algorithm_save_load_checkpoint_learner.py @@ -32,7 +32,7 @@ def save_and_train(algo_cfg: AlgorithmConfig, env: str, tmpdir): The learner stats after 2 iterations of training. 
""" algo_cfg = ( - algo_cfg.experimental(_enable_new_api_stack=True) + algo_cfg.api_stack(enable_rl_module_and_learner=True) .env_runners(num_env_runners=0) # setting min_time_s_per_iteration=0 and min_sample_timesteps_per_iteration=1 # to make sure that we get results as soon as sampling/training is done at @@ -68,7 +68,7 @@ def load_and_train(algo_cfg: AlgorithmConfig, env: str, tmpdir): """ algo_cfg = ( - algo_cfg.experimental(_enable_new_api_stack=True) + algo_cfg.api_stack(enable_rl_module_and_learner=True) .env_runners(num_env_runners=0) # setting min_time_s_per_iteration=0 and min_sample_timesteps_per_iteration=1 # to make sure that we get results as soon as sampling/training is done at diff --git a/rllib/tests/test_rllib_train_and_evaluate.py b/rllib/tests/test_rllib_train_and_evaluate.py index 3bbe33a16a5a..899dc6d4493e 100644 --- a/rllib/tests/test_rllib_train_and_evaluate.py +++ b/rllib/tests/test_rllib_train_and_evaluate.py @@ -96,7 +96,7 @@ def learn_test_plus_evaluate(algo: str, env="CartPole-v1"): # call rllib train here to see if the RLModule API is enabled. algo_cls = get_trainable_cls(algo) config = algo_cls.get_default_config() - if config._enable_new_api_stack: + if config.enable_rl_module_and_learner: eval_ = ', \\"evaluation_config\\": {}' else: eval_ = ', \\"evaluation_config\\": {\\"explore\\": false}' diff --git a/rllib/tuned_examples/appo/cartpole-appo-separate-losses.py b/rllib/tuned_examples/appo/cartpole-appo-separate-losses.py index dcce4afc042b..730314303263 100644 --- a/rllib/tuned_examples/appo/cartpole-appo-separate-losses.py +++ b/rllib/tuned_examples/appo/cartpole-appo-separate-losses.py @@ -28,7 +28,7 @@ ) .env_runners( num_envs_per_env_runner=5, - num_rollout_workers=1, + num_env_runners=1, observation_filter="MeanStdFilter", ) .resources(num_gpus=0) diff --git a/rllib/tuned_examples/appo/cartpole-appo-w-rl-modules-and-learner.yaml b/rllib/tuned_examples/appo/cartpole-appo-w-rl-modules-and-learner.yaml index a11ecb312fe4..99a3e024a23c 100644 --- a/rllib/tuned_examples/appo/cartpole-appo-w-rl-modules-and-learner.yaml +++ b/rllib/tuned_examples/appo/cartpole-appo-w-rl-modules-and-learner.yaml @@ -6,7 +6,7 @@ cartpole-appo-w-rl-modules-and-learner: timesteps_total: 200000 config: # Run with Learner- and RLModule API (new stack). - _enable_new_api_stack: true + enable_rl_module_and_learner: true # Works for both torch and tf. framework: torch diff --git a/rllib/tuned_examples/appo/multi-agent-cartpole-crashing-and-stalling-recreate-workers-appo.py b/rllib/tuned_examples/appo/multi-agent-cartpole-crashing-and-stalling-recreate-workers-appo.py index dc68afcddf1a..946a65ad1042 100644 --- a/rllib/tuned_examples/appo/multi-agent-cartpole-crashing-and-stalling-recreate-workers-appo.py +++ b/rllib/tuned_examples/appo/multi-agent-cartpole-crashing-and-stalling-recreate-workers-appo.py @@ -37,7 +37,7 @@ ) .env_runners( num_env_runners=3, - num_envs_per_worker=1, + num_envs_per_env_runner=1, ) # Switch on resiliency (recreate any failed worker). 
.fault_tolerance( diff --git a/rllib/tuned_examples/appo/multi-agent-cartpole-crashing-recreate-workers-appo.py b/rllib/tuned_examples/appo/multi-agent-cartpole-crashing-recreate-workers-appo.py index 970e002b3633..4ac5afd7ea7f 100644 --- a/rllib/tuned_examples/appo/multi-agent-cartpole-crashing-recreate-workers-appo.py +++ b/rllib/tuned_examples/appo/multi-agent-cartpole-crashing-recreate-workers-appo.py @@ -31,7 +31,7 @@ }, ) .env_runners( - num_rollout_workers=4, + num_env_runners=4, num_envs_per_env_runner=1, ) # Switch on resiliency (recreate any failed worker). diff --git a/rllib/tuned_examples/appo/pong-appo-w-rl-modules-and-learner.yaml b/rllib/tuned_examples/appo/pong-appo-w-rl-modules-and-learner.yaml index 4d412735d1c8..fa440ff61b46 100644 --- a/rllib/tuned_examples/appo/pong-appo-w-rl-modules-and-learner.yaml +++ b/rllib/tuned_examples/appo/pong-appo-w-rl-modules-and-learner.yaml @@ -8,7 +8,7 @@ appo-pongnoframeskip-v5: timesteps_total: 20000000 config: # Run with Learner- and RLModule API (new stack). - _enable_new_api_stack: true + enable_rl_module_and_learner: true # Make analogous to old v4 + NoFrameskip. env_config: frameskip: 1 diff --git a/rllib/tuned_examples/appo/stateless_cartpole_appo.py b/rllib/tuned_examples/appo/stateless_cartpole_appo.py index f66f23649649..4a1f4d288157 100644 --- a/rllib/tuned_examples/appo/stateless_cartpole_appo.py +++ b/rllib/tuned_examples/appo/stateless_cartpole_appo.py @@ -5,7 +5,7 @@ config = ( APPOConfig() # TODO: Switch over to new stack once it supports LSTMs. - .experimental(_enable_new_api_stack=False) + .api_stack(enable_rl_module_and_learner=False) .environment(StatelessCartPole) .resources(num_gpus=0) .env_runners(num_env_runners=1, observation_filter="MeanStdFilter") diff --git a/rllib/tuned_examples/dqn/benchmark_dqn_atari.py b/rllib/tuned_examples/dqn/benchmark_dqn_atari.py index c8e26ab4763f..af5f352600bd 100644 --- a/rllib/tuned_examples/dqn/benchmark_dqn_atari.py +++ b/rllib/tuned_examples/dqn/benchmark_dqn_atari.py @@ -4,7 +4,6 @@ from ray.rllib.algorithms.dqn.dqn import DQNConfig from ray.rllib.connectors.env_to_module.frame_stacking import FrameStackingEnvToModule from ray.rllib.connectors.learner.frame_stacking import FrameStackingLearner -from ray.rllib.env.single_agent_env_runner import SingleAgentEnvRunner from ray.tune import Stopper from ray import train, tune @@ -237,8 +236,8 @@ for env in benchmark_envs.keys(): tune.register_env( env, - lambda ctx: AtariPreprocessing( - gym.make(env, **ctx), grayscale_newaxis=True, screen_size=84, noop_max=0 + lambda ctx, e=env: AtariPreprocessing( + gym.make(e, **ctx), grayscale_newaxis=True, screen_size=84, noop_max=0 ), ) @@ -297,12 +296,14 @@ def stop_all(self): clip_rewards=True, ) # Enable new API stack and use EnvRunner. - .experimental(_enable_new_api_stack=True) + .api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, + ) .env_runners( # Every 4 agent steps a training update is performed. rollout_fragment_length=4, - env_runner_cls=SingleAgentEnvRunner, - num_rollout_workers=1, + num_env_runners=1, env_to_module_connector=_make_env_to_module_connector, ) # TODO (simon): Adjust to new model_config_dict. 
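# A minimal, standalone sketch (not RLlib code) of the closure late-binding
# pitfall that the `lambda ctx, e=env: ...` default-argument change in the Atari
# benchmark scripts addresses: without the default argument, every factory
# registered in the loop sees only the loop variable's final value.
env_names = ["env_a", "env_b"]  # placeholder names, not the benchmark's real envs

late_bound = [lambda: name for name in env_names]
print([f() for f in late_bound])   # ['env_b', 'env_b'] -- all capture the last value

early_bound = [lambda n=name: n for name in env_names]
print([f() for f in early_bound])  # ['env_a', 'env_b'] -- value bound per iteration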
diff --git a/rllib/tuned_examples/dqn/benchmark_dqn_atari_rllib_preprocessing.py b/rllib/tuned_examples/dqn/benchmark_dqn_atari_rllib_preprocessing.py index 8ee3937c13cc..9fab4f934362 100644 --- a/rllib/tuned_examples/dqn/benchmark_dqn_atari_rllib_preprocessing.py +++ b/rllib/tuned_examples/dqn/benchmark_dqn_atari_rllib_preprocessing.py @@ -1,7 +1,6 @@ import gymnasium as gym from ray.rllib.algorithms.dqn.dqn import DQNConfig -from ray.rllib.env.single_agent_env_runner import SingleAgentEnvRunner from ray.rllib.env.wrappers.atari_wrappers import wrap_atari_for_new_api_stack from ray.tune import Stopper from ray import train, tune @@ -237,7 +236,7 @@ env, # Use the RLlib atari wrapper to squeeze images to 84x84. # Note, the default of this wrapper is `framestack=4`. - lambda ctx: wrap_atari_for_new_api_stack(gym.make(env, **ctx), dim=84), + lambda ctx, e=env: wrap_atari_for_new_api_stack(gym.make(e, **ctx), dim=84), ) @@ -290,12 +289,14 @@ def stop_all(self): clip_rewards=True, ) # Enable new API stack and use EnvRunner. - .experimental(_enable_new_api_stack=True) + .api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, + ) .env_runners( # Every 4 agent steps a training update is performed. rollout_fragment_length=4, - env_runner_cls=SingleAgentEnvRunner, - num_rollout_workers=1, + num_env_runners=1, ) .resources( # We have a train/sample ratio of 1:1 and a batch of 32. diff --git a/rllib/tuned_examples/dqn/cartpole_dqn_envrunner.py b/rllib/tuned_examples/dqn/cartpole_dqn_envrunner.py index c4faffe24609..3c3d4ff6b4d6 100644 --- a/rllib/tuned_examples/dqn/cartpole_dqn_envrunner.py +++ b/rllib/tuned_examples/dqn/cartpole_dqn_envrunner.py @@ -1,15 +1,14 @@ from ray.rllib.algorithms.dqn import DQNConfig -from ray.rllib.env.single_agent_env_runner import SingleAgentEnvRunner config = ( DQNConfig() .environment(env="CartPole-v1") .framework(framework="torch") - .experimental(_enable_new_api_stack=True) - .env_runners( - env_runner_cls=SingleAgentEnvRunner, - num_rollout_workers=0, + .api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, ) + .env_runners(num_env_runners=0) .resources( num_learner_workers=0, ) diff --git a/rllib/tuned_examples/impala/cartpole-impala.yaml b/rllib/tuned_examples/impala/cartpole-impala.yaml index 5fc12c5ccd21..249a8d8db420 100644 --- a/rllib/tuned_examples/impala/cartpole-impala.yaml +++ b/rllib/tuned_examples/impala/cartpole-impala.yaml @@ -5,7 +5,7 @@ cartpole-impala: sampler_results/episode_reward_mean: 150 timesteps_total: 500000 config: - _enable_new_api_stack: true + enable_rl_module_and_learner: true # Works for both torch and tf. framework: tf2 num_gpus: 0 diff --git a/rllib/tuned_examples/ppo/benchmark_ppo_mujoco.py b/rllib/tuned_examples/ppo/benchmark_ppo_mujoco.py index f05a63c55319..f6d53f61ba5b 100644 --- a/rllib/tuned_examples/ppo/benchmark_ppo_mujoco.py +++ b/rllib/tuned_examples/ppo/benchmark_ppo_mujoco.py @@ -1,5 +1,4 @@ from ray.rllib.algorithms.ppo.ppo import PPOConfig -from ray.rllib.env.single_agent_env_runner import SingleAgentEnvRunner from ray.tune import Stopper from ray import train, tune @@ -80,11 +79,13 @@ def stop_all(self): PPOConfig() .environment(env=tune.grid_search(list(benchmark_envs.keys()))) # Enable new API stack and use EnvRunner. - .experimental(_enable_new_api_stack=True) + .api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, + ) .env_runners( - env_runner_cls=SingleAgentEnvRunner, # Following the paper. 
- num_rollout_workers=32, + num_env_runners=32, rollout_fragment_length=512, ) .resources( diff --git a/rllib/tuned_examples/ppo/benchmark_ppo_mujoco_pb2.py b/rllib/tuned_examples/ppo/benchmark_ppo_mujoco_pb2.py index a7a21431872b..3683c0fb2a38 100644 --- a/rllib/tuned_examples/ppo/benchmark_ppo_mujoco_pb2.py +++ b/rllib/tuned_examples/ppo/benchmark_ppo_mujoco_pb2.py @@ -1,6 +1,5 @@ import time from ray.rllib.algorithms.ppo.ppo import PPOConfig -from ray.rllib.env.single_agent_env_runner import SingleAgentEnvRunner from ray.tune.schedulers.pb2 import PB2 from ray import train, tune @@ -70,11 +69,13 @@ PPOConfig() .environment(env=env) # Enable new API stack and use EnvRunner. - .experimental(_enable_new_api_stack=True) + .api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, + ) .env_runners( rollout_fragment_length=1, - env_runner_cls=SingleAgentEnvRunner, - num_rollout_workers=num_rollout_workers, + num_env_runners=num_rollout_workers, # TODO (sven, simon): Add resources. ) .resources( diff --git a/rllib/tuned_examples/ppo/cartpole_ppo_envrunner.py b/rllib/tuned_examples/ppo/cartpole_ppo_envrunner.py index 01be9b453b67..dd1282df1a60 100644 --- a/rllib/tuned_examples/ppo/cartpole_ppo_envrunner.py +++ b/rllib/tuned_examples/ppo/cartpole_ppo_envrunner.py @@ -1,15 +1,14 @@ from ray.rllib.algorithms.ppo import PPOConfig -from ray.rllib.env.single_agent_env_runner import SingleAgentEnvRunner config = ( PPOConfig() # Enable new API stack and use EnvRunner. - .experimental(_enable_new_api_stack=True) - .env_runners( - env_runner_cls=SingleAgentEnvRunner, - num_env_runners=1, + .api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, ) + .env_runners(num_env_runners=1) .environment("CartPole-v1") .rl_module( model_config_dict={ diff --git a/rllib/tuned_examples/ppo/memory_leak_test_ppo_new_stack.py b/rllib/tuned_examples/ppo/memory_leak_test_ppo_new_stack.py index bd9db73545e7..65c6d3dc4261 100644 --- a/rllib/tuned_examples/ppo/memory_leak_test_ppo_new_stack.py +++ b/rllib/tuned_examples/ppo/memory_leak_test_ppo_new_stack.py @@ -1,15 +1,16 @@ from ray.rllib.algorithms.ppo import PPOConfig -from ray.rllib.env.single_agent_env_runner import SingleAgentEnvRunner from ray.rllib.examples.envs.classes.random_env import RandomLargeObsSpaceEnv config = ( PPOConfig() - .experimental(_enable_new_api_stack=True) + .api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, + ) # Switch off np.random, which is known to have memory leaks. 
.environment(RandomLargeObsSpaceEnv, env_config={"static_samples": True}) .env_runners( - env_runner_cls=SingleAgentEnvRunner, num_env_runners=4, num_envs_per_env_runner=5, ) diff --git a/rllib/tuned_examples/ppo/multi_agent_pendulum_ppo_envrunner.py b/rllib/tuned_examples/ppo/multi_agent_pendulum_ppo_envrunner.py index 4044f0b6f09f..3c9e2224ecab 100644 --- a/rllib/tuned_examples/ppo/multi_agent_pendulum_ppo_envrunner.py +++ b/rllib/tuned_examples/ppo/multi_agent_pendulum_ppo_envrunner.py @@ -1,5 +1,4 @@ from ray.rllib.algorithms.ppo import PPOConfig -from ray.rllib.env.multi_agent_env_runner import MultiAgentEnvRunner from ray.rllib.examples.envs.classes.multi_agent import MultiAgentPendulum from ray.tune.registry import register_env @@ -8,10 +7,12 @@ config = ( PPOConfig() - .experimental(_enable_new_api_stack=True) + .api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, + ) .environment("multi_agent_pendulum") .env_runners( - env_runner_cls=MultiAgentEnvRunner, num_envs_per_env_runner=1, num_env_runners=2, ) diff --git a/rllib/tuned_examples/ppo/pendulum_ppo_envrunner.py b/rllib/tuned_examples/ppo/pendulum_ppo_envrunner.py index 4ade7348b20f..7d2d03c415cd 100644 --- a/rllib/tuned_examples/ppo/pendulum_ppo_envrunner.py +++ b/rllib/tuned_examples/ppo/pendulum_ppo_envrunner.py @@ -1,13 +1,14 @@ from ray.rllib.algorithms.ppo import PPOConfig -from ray.rllib.env.single_agent_env_runner import SingleAgentEnvRunner config = ( PPOConfig() # Enable new API stack and use EnvRunner. - .experimental(_enable_new_api_stack=True) + .api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, + ) .env_runners( - env_runner_cls=SingleAgentEnvRunner, num_env_runners=2, num_envs_per_env_runner=20, ) diff --git a/rllib/tuned_examples/sac/benchmark_sac_mujoco.py b/rllib/tuned_examples/sac/benchmark_sac_mujoco.py index 7f33a20b3913..9579810f647d 100644 --- a/rllib/tuned_examples/sac/benchmark_sac_mujoco.py +++ b/rllib/tuned_examples/sac/benchmark_sac_mujoco.py @@ -1,5 +1,4 @@ from ray.rllib.algorithms.sac.sac import SACConfig -from ray.rllib.env.single_agent_env_runner import SingleAgentEnvRunner from ray.tune import Stopper from ray import train, tune @@ -68,11 +67,13 @@ def stop_all(self): SACConfig() .environment(env=tune.grid_search(list(benchmark_envs.keys()))) # Enable new API stack and use EnvRunner. - .experimental(_enable_new_api_stack=True) + .api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, + ) .env_runners( rollout_fragment_length=1, - env_runner_cls=SingleAgentEnvRunner, - num_rollout_workers=0, + num_env_runners=0, ) .resources( # Note, we have a sample/train ratio of 1:1 and a small train diff --git a/rllib/tuned_examples/sac/benchmark_sac_mujoco_pb2.py b/rllib/tuned_examples/sac/benchmark_sac_mujoco_pb2.py index 414b94833a5e..0d73a0816bc4 100644 --- a/rllib/tuned_examples/sac/benchmark_sac_mujoco_pb2.py +++ b/rllib/tuned_examples/sac/benchmark_sac_mujoco_pb2.py @@ -1,6 +1,5 @@ import time from ray.rllib.algorithms.sac.sac import SACConfig -from ray.rllib.env.single_agent_env_runner import SingleAgentEnvRunner from ray.tune.schedulers.pb2 import PB2 from ray import train, tune @@ -58,11 +57,13 @@ SACConfig() .environment(env=env) # Enable new API stack and use EnvRunner. 
- .experimental(_enable_new_api_stack=True) + .api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, + ) .env_runners( rollout_fragment_length="auto", - env_runner_cls=SingleAgentEnvRunner, - num_rollout_workers=1, + num_env_runners=1, # TODO (sven, simon): Add resources. ) .resources( diff --git a/rllib/tuned_examples/sac/pendulum_sac_envrunner.py b/rllib/tuned_examples/sac/pendulum_sac_envrunner.py index 87a1ba56e0e0..2b04c62b099c 100644 --- a/rllib/tuned_examples/sac/pendulum_sac_envrunner.py +++ b/rllib/tuned_examples/sac/pendulum_sac_envrunner.py @@ -1,13 +1,14 @@ from ray.rllib.algorithms.sac.sac import SACConfig -from ray.rllib.env.single_agent_env_runner import SingleAgentEnvRunner config = ( SACConfig() # Enable new API stack and use EnvRunner. - .experimental(_enable_new_api_stack=True) + .api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, + ) .env_runners( rollout_fragment_length=1, - env_runner_cls=SingleAgentEnvRunner, num_env_runners=0, ) .environment(env="Pendulum-v1") diff --git a/rllib/utils/checkpoints.py b/rllib/utils/checkpoints.py index 193742f39bef..1c5a989ce117 100644 --- a/rllib/utils/checkpoints.py +++ b/rllib/utils/checkpoints.py @@ -223,7 +223,7 @@ def convert_to_msgpack_checkpoint( state["worker"]["is_policy_to_train"] = NOT_SERIALIZABLE # Add RLlib checkpoint version (as string). - if state["config"]["_enable_new_api_stack"]: + if state["config"]["enable_rl_module_and_learner"]: state["checkpoint_version"] = str(CHECKPOINT_VERSION_LEARNER) else: state["checkpoint_version"] = str(CHECKPOINT_VERSION) diff --git a/rllib/utils/debug/memory.py b/rllib/utils/debug/memory.py index ab4641521baf..5f7944c08177 100644 --- a/rllib/utils/debug/memory.py +++ b/rllib/utils/debug/memory.py @@ -112,7 +112,7 @@ def code(): results_per_category["policy"].extend(test) # Testing this only makes sense if the learner API is disabled. - if not policy.config.get("_enable_new_api_stack", False): + if not policy.config.get("enable_rl_module_and_learner", False): # Call `learn_on_batch()` n times. dummy_batch = policy._get_dummy_batch_from_view_requirements(batch_size=16) @@ -172,7 +172,9 @@ def code(): if test: results_per_category["rollout_worker"].extend(test) - if "learner" in to_check and algorithm.config.get("_enable_new_api_stack", False): + if "learner" in to_check and algorithm.config.get( + "enable_rl_module_and_learner", False + ): learner_group = algorithm.learner_group assert learner_group._is_local, ( "This test will miss leaks hidden in remote " diff --git a/rllib/utils/exploration/tests/test_explorations.py b/rllib/utils/exploration/tests/test_explorations.py index 8d15b74c751a..b0cb4a4ff915 100644 --- a/rllib/utils/exploration/tests/test_explorations.py +++ b/rllib/utils/exploration/tests/test_explorations.py @@ -22,7 +22,7 @@ def do_test_explorations(config, dummy_obs, prev_a=None, expected_mean_action=No for exploration in [None, "Random"]: local_config = config.copy() if exploration == "Random": - if local_config._enable_new_api_stack: + if local_config.enable_rl_module_and_learner: # TODO(Artur): Support Random exploration with RL Modules. 
continue local_config.env_runners(exploration_config={"type": "Random"}) diff --git a/rllib/utils/metrics/stats.py b/rllib/utils/metrics/stats.py index 87dbc5571813..422c36c0c4cf 100644 --- a/rllib/utils/metrics/stats.py +++ b/rllib/utils/metrics/stats.py @@ -413,7 +413,8 @@ def _reduced_values(self) -> Tuple[Any, Any]: return mean_value, [mean_value] # Do non-EMA reduction (possibly using a window). else: - reduce_meth = getattr(np, self._reduce_method) + # Use the numpy "nan"-prefix to ignore NaN's in our value lists. + reduce_meth = getattr(np, "nan" + self._reduce_method) values = ( self.values if self._window is None else self.values[-self._window :] ) diff --git a/rllib/utils/test_utils.py b/rllib/utils/test_utils.py index 76f012cc6426..a9aac8d012b5 100644 --- a/rllib/utils/test_utils.py +++ b/rllib/utils/test_utils.py @@ -101,7 +101,7 @@ def add_rllib_example_script_args( parser.add_argument( "--enable-new-api-stack", action="store_true", - help="Whether to use the _enable_new_api_stack config setting.", + help="Whether to use the `enable_rl_module_and_learner` config setting.", ) parser.add_argument( "--framework", @@ -423,7 +423,7 @@ def _test( input_dict[SampleBatch.PREV_ACTIONS] = action_in input_dict[SampleBatch.PREV_REWARDS] = reward_in if state_in: - if what.config.get("_enable_new_api_stack", False): + if what.config.get("enable_rl_module_and_learner", False): input_dict["state_in"] = state_in else: for i, s in enumerate(state_in): @@ -895,7 +895,7 @@ def framework_iterator( for fw in frameworks: # Skip tf if on new API stack. - if fw == "tf" and config.get("_enable_new_api_stack", False): + if fw == "tf" and config.get("enable_rl_module_and_learner", False): logger.warning("Skipping `framework=tf` (new API stack configured)!") continue # Skip if tf/tf2 and py >= 3.11. @@ -1355,30 +1355,17 @@ def run_rllib_example_script_experiment( "training_iteration": args.stop_iters, } - from ray.rllib.env.multi_agent_env_runner import MultiAgentEnvRunner - from ray.rllib.env.single_agent_env_runner import SingleAgentEnvRunner - # Enhance the `base_config`, based on provided `args`. config = ( # Set the framework. base_config.framework(args.framework) # Enable the new API stack? - .experimental(_enable_new_api_stack=args.enable_new_api_stack) - # Define EnvRunner/RolloutWorker scaling and behavior. - .env_runners( - num_env_runners=args.num_env_runners, - # Set up the correct env-runner to use depending on - # old-stack/new-stack and multi-agent settings. - env_runner_cls=( - None - if not args.enable_new_api_stack - else ( - SingleAgentEnvRunner - if args.num_agents == 0 - else MultiAgentEnvRunner - ) - ), + .api_stack( + enable_rl_module_and_learner=args.enable_new_api_stack, + enable_env_runner_and_connector_v2=args.enable_new_api_stack, ) + # Define EnvRunner/RolloutWorker scaling and behavior. + .env_runners(num_env_runners=args.num_env_runners) # Define compute resources used. .resources( # Old stack. 
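# A minimal sketch (illustration only, not part of the patch) of why the
# `Stats._reduced_values` change above switches to the "nan"-prefixed numpy
# reductions: np.nanmean/np.nansum and friends drop NaN entries instead of
# propagating them the way plain np.mean/np.sum would.
import numpy as np

values = [3.0, float("nan"), 5.0]
print(np.mean(values))                     # nan -- a single NaN poisons the result
print(np.nanmean(values))                  # 4.0 -- NaN entries are ignored
print(getattr(np, "nan" + "sum")(values))  # 8.0 -- mirrors the getattr lookup
# Note: an all-NaN input still reduces to NaN (with a RuntimeWarning), so callers
# may still need an explicit np.isnan guard, as the stop-criteria check below adds.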
@@ -1402,7 +1389,7 @@ def run_rllib_example_script_experiment( ] print(f" R(eval)={Reval}", end="") print() - for key, value in stop.items(): + for key, threshold in stop.items(): val = results for k in key.split("/"): try: @@ -1410,8 +1397,8 @@ def run_rllib_example_script_experiment( except KeyError: val = None break - if val is not None and val >= value: - print(f"Stop criterium ({key}={value}) fulfilled!") + if val is not None and not np.isnan(val) and val >= threshold: + print(f"Stop criterium ({key}={threshold}) fulfilled!") return results ray.shutdown() return results @@ -1585,7 +1572,7 @@ def check_reproducibilty( num_gpus: int(os.environ.get("RLLIB_NUM_GPUS", "0")) num_workers: 0 (only local workers) or 4 ((1) local workers + (4) remote workers) - num_envs_per_worker: 2 + num_envs_per_env_runner: 2 Args: algo_class: Algorithm class to test. @@ -1616,7 +1603,7 @@ def check_reproducibilty( # new API num_gpus_per_learner_worker=int(os.environ.get("RLLIB_NUM_GPUS", "0")), ) - .env_runners(num_rollout_workers=num_workers, num_envs_per_worker=2) + .env_runners(num_env_runners=num_workers, num_envs_per_env_runner=2) ) for fw in framework_iterator(algo_config, **fw_kwargs): @@ -1647,7 +1634,7 @@ def check_reproducibilty( # iterations). # As well as training behavior (minibatch sequence during SGD # iterations). - if algo_config._enable_new_api_stack: + if algo_config.enable_rl_module_and_learner: check( results1["info"][LEARNER_INFO][DEFAULT_POLICY_ID], results2["info"][LEARNER_INFO][DEFAULT_POLICY_ID], @@ -2010,7 +1997,7 @@ def _do_check(alg, config, a_name, o_name): config_copy = config.copy() config_copy.validate() # If RLModules are enabled, we need to skip a few tests for now: - if config_copy._enable_new_api_stack: + if config_copy.enable_rl_module_and_learner: # Skip PPO cases in which RLModules don't support the given spaces yet. if o_name not in rlmodule_supported_observation_spaces: logger.warning( @@ -2089,7 +2076,7 @@ def _do_check(alg, config, a_name, o_name): if not frameworks: frameworks = ("tf2", "tf", "torch") - if config._enable_new_api_stack: + if config.enable_rl_module_and_learner: # Only test the frameworks that are supported by RLModules. frameworks = tuple( fw for fw in frameworks if fw in rlmodule_supported_frameworks