From 596a4d88ab27cbecf4b8d10ff0cc45d304f9b026 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Fri, 30 Aug 2024 21:15:58 +0200 Subject: [PATCH 01/20] wip Signed-off-by: sven1977 --- rllib/algorithms/ppo/ppo.py | 17 +++++++---- rllib/core/learner/learner.py | 2 ++ rllib/policy/sample_batch.py | 14 +++++++-- rllib/tests/test_lstm.py | 1 - .../ppo/multi_agent_pendulum_ppo.py | 17 +++++------ rllib/tuned_examples/ppo/pendulum_ppo.py | 29 +++++-------------- rllib/utils/minibatch_utils.py | 9 ++++++ rllib/utils/test_utils.py | 6 ++-- 8 files changed, 51 insertions(+), 44 deletions(-) diff --git a/rllib/algorithms/ppo/ppo.py b/rllib/algorithms/ppo/ppo.py index fd261c44309c..2fa7c2d0d589 100644 --- a/rllib/algorithms/ppo/ppo.py +++ b/rllib/algorithms/ppo/ppo.py @@ -142,7 +142,7 @@ def __init__(self, algo_class=None): # Simple logic for now: If None, use `train_batch_size`. self.mini_batch_size_per_learner = None self.num_sgd_iter = 30 - self.shuffle_sequences = True + self.shuffle_single_agent_batch = True self.vf_loss_coeff = 1.0 self.entropy_coeff = 0.0 self.entropy_coeff_schedule = None @@ -220,7 +220,7 @@ def training( mini_batch_size_per_learner: Optional[int] = NotProvided, sgd_minibatch_size: Optional[int] = NotProvided, num_sgd_iter: Optional[int] = NotProvided, - shuffle_sequences: Optional[bool] = NotProvided, + shuffle_single_agent_batch: Optional[bool] = NotProvided, vf_loss_coeff: Optional[float] = NotProvided, entropy_coeff: Optional[float] = NotProvided, entropy_coeff_schedule: Optional[List[List[Union[int, float]]]] = NotProvided, @@ -260,8 +260,13 @@ def training( new API stack (use `mini_batch_size_per_learner` instead). num_sgd_iter: Number of SGD iterations in each outer loop (i.e., number of epochs to execute per train batch). - shuffle_sequences: Whether to shuffle sequences in the batch when training - (recommended). + shuffle_single_agent_batch: Whether to shuffle each single-agent batch once + before a new epoch (which consists of n x minibatches, where n is + `batch_size_per_learner` // `mini_batch_size_per_learner`). This should + be set to True in single-agent and independent multi-agent cases as it + ensures proper mixing of the samples before each batch epoch. Otherwise, + the sequence of minibatches iterated through is the same in each + iteration, possibly impacting learning. vf_loss_coeff: Coefficient of the value function loss. IMPORTANT: you must tune this if you set vf_share_layers=True inside your model's config. 
entropy_coeff: The entropy coefficient (float) or entropy coefficient @@ -302,8 +307,8 @@ def training( self.sgd_minibatch_size = sgd_minibatch_size if num_sgd_iter is not NotProvided: self.num_sgd_iter = num_sgd_iter - if shuffle_sequences is not NotProvided: - self.shuffle_sequences = shuffle_sequences + if shuffle_single_agent_batch is not NotProvided: + self.shuffle_single_agent_batch = shuffle_single_agent_batch if vf_loss_coeff is not NotProvided: self.vf_loss_coeff = vf_loss_coeff if entropy_coeff is not NotProvided: diff --git a/rllib/core/learner/learner.py b/rllib/core/learner/learner.py index 01d2c11da3b6..98b15a5cab18 100644 --- a/rllib/core/learner/learner.py +++ b/rllib/core/learner/learner.py @@ -1300,6 +1300,7 @@ def _update_from_batch_or_episodes( MiniBatchCyclicIterator, uses_new_env_runners=True, num_total_mini_batches=num_total_mini_batches, + shuffle=self.config.shuffle_single_agent_batch, ) else: batch_iter = MiniBatchCyclicIterator @@ -1310,6 +1311,7 @@ def _update_from_batch_or_episodes( # this behavior here by setting the minibatch size to be the size # of the batch (e.g. 1 minibatch of size batch.count) minibatch_size = batch.count + # Note that there is no need to shuffle here, b/c we don't have minibatches. batch_iter = MiniBatchCyclicIterator else: # `minibatch_size` and `num_iters` are not set by the user. diff --git a/rllib/policy/sample_batch.py b/rllib/policy/sample_batch.py index 8ed604728fc7..098ddc2218ad 100644 --- a/rllib/policy/sample_batch.py +++ b/rllib/policy/sample_batch.py @@ -462,23 +462,31 @@ def shuffle(self) -> "SampleBatch": {"a": [4, 1, 3, 2]} """ + has_time_rank = self.get(SampleBatch.SEQ_LENS) is not None # Shuffling the data when we have `seq_lens` defined is probably # a bad idea! - if self.get(SampleBatch.SEQ_LENS) is not None: + if has_time_rank and not self.zero_padded: raise ValueError( "SampleBatch.shuffle not possible when your data has " - "`seq_lens` defined!" + "`seq_lens` defined AND is not zero-padded yet!" ) # Get a permutation over the single items once and use the same # permutation for all the data (otherwise, data would become # meaningless). - permutation = np.random.permutation(self.count) + # - Shuffle by individual item. + if not has_time_rank: + permutation = np.random.permutation(self.count) + # - Shuffle along batch axis (leave axis=1/time-axis as-is). + else: + permutation = np.random.permutation(len(self[SampleBatch.SEQ_LENS])) self_as_dict = dict(self) shuffled = tree.map_structure(lambda v: v[permutation], self_as_dict) + self.update(shuffled) + # Flush cache such that intercepted values are recalculated after the # shuffling. 
self.intercepted_values = {} diff --git a/rllib/tests/test_lstm.py b/rllib/tests/test_lstm.py index 245d3db9b055..d93951be0f67 100644 --- a/rllib/tests/test_lstm.py +++ b/rllib/tests/test_lstm.py @@ -261,7 +261,6 @@ def test_minibatch_sequencing(self): "max_seq_len": 4, "vf_share_layers": True, }, - shuffle_sequences=False, # for deterministic testing ) ) ppo = config.build() diff --git a/rllib/tuned_examples/ppo/multi_agent_pendulum_ppo.py b/rllib/tuned_examples/ppo/multi_agent_pendulum_ppo.py index 757f6bbda8c1..082d505efcce 100644 --- a/rllib/tuned_examples/ppo/multi_agent_pendulum_ppo.py +++ b/rllib/tuned_examples/ppo/multi_agent_pendulum_ppo.py @@ -26,20 +26,19 @@ enable_env_runner_and_connector_v2=True, ) .environment("multi_agent_pendulum", env_config={"num_agents": args.num_agents}) + .env_runners(num_env_runners=4) + .training( + lr=0.0003, + lambda_=0.1, + vf_clip_param=10.0, + num_sgd_iter=6, + ) .rl_module( model_config_dict={ "fcnet_activation": "relu", "uses_new_env_runners": True, }, ) - .training( - train_batch_size=512, - lambda_=0.1, - gamma=0.95, - lr=0.0003, - sgd_minibatch_size=64, - vf_clip_param=10.0, - ) .multi_agent( policy_mapping_fn=lambda aid, *arg, **kw: f"p{aid}", policies={f"p{i}" for i in range(args.num_agents)}, @@ -49,7 +48,7 @@ stop = { NUM_ENV_STEPS_SAMPLED_LIFETIME: args.stop_timesteps, # Divide by num_agents to get actual return per agent. - f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": -400.0 * (args.num_agents or 1), + f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": -300.0 * (args.num_agents or 1), } diff --git a/rllib/tuned_examples/ppo/pendulum_ppo.py b/rllib/tuned_examples/ppo/pendulum_ppo.py index b74dfb5db827..84c0ddd74f90 100644 --- a/rllib/tuned_examples/ppo/pendulum_ppo.py +++ b/rllib/tuned_examples/ppo/pendulum_ppo.py @@ -1,13 +1,7 @@ from ray.rllib.algorithms.ppo import PPOConfig -from ray.rllib.utils.metrics import ( - ENV_RUNNER_RESULTS, - EPISODE_RETURN_MEAN, - EVALUATION_RESULTS, - NUM_ENV_STEPS_SAMPLED_LIFETIME, -) from ray.rllib.utils.test_utils import add_rllib_example_script_args -parser = add_rllib_example_script_args() +parser = add_rllib_example_script_args(default_timesteps=400000, default_reward=-300.0) parser.set_defaults(enable_new_api_stack=True) # Use `parser` to add your own custom command line options to this script # and (if needed) use their values toset up `config` below. 
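The SampleBatch.shuffle() change earlier in this patch permutes whole zero-padded sequences along the batch axis when `seq_lens` is present, and individual rows otherwise. Below is a minimal, self-contained sketch of that rule; the helper name and the plain-dict input are illustrative only and not part of the patch:

    import numpy as np

    def shuffle_batch(batch_dict, seq_lens=None):
        # Mirror of the shuffle rule above: with `seq_lens` (zero-padded data),
        # permute along axis 0 only, keeping each padded sequence intact;
        # without a time rank, permute individual items.
        n = len(seq_lens) if seq_lens is not None else len(next(iter(batch_dict.values())))
        permutation = np.random.permutation(n)
        return {key: value[permutation] for key, value in batch_dict.items()}
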
@@ -22,34 +16,25 @@ ) .env_runners( num_env_runners=2, - num_envs_per_env_runner=20, + num_envs_per_env_runner=10, ) .environment("Pendulum-v1") .training( - train_batch_size_per_learner=512, - gamma=0.95, lr=0.0003, lambda_=0.1, vf_clip_param=10.0, - sgd_minibatch_size=64, - model={ + num_sgd_iter=6, + ) + .rl_module( + model_config_dict={ "fcnet_activation": "relu", "uses_new_env_runners": True, }, ) - .evaluation( - evaluation_num_env_runners=1, - evaluation_interval=1, - evaluation_parallel_to_training=True, - ) ) -stop = { - NUM_ENV_STEPS_SAMPLED_LIFETIME: 400000, - f"{EVALUATION_RESULTS}/{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": -400.0, -} if __name__ == "__main__": from ray.rllib.utils.test_utils import run_rllib_example_script_experiment - run_rllib_example_script_experiment(config, args, stop=stop) + run_rllib_example_script_experiment(config, args) diff --git a/rllib/utils/minibatch_utils.py b/rllib/utils/minibatch_utils.py index 1fccb2e2fb0c..883d08d84ade 100644 --- a/rllib/utils/minibatch_utils.py +++ b/rllib/utils/minibatch_utils.py @@ -50,6 +50,7 @@ def __init__( num_iters: int = 1, uses_new_env_runners: bool = False, num_total_mini_batches: int = 0, + shuffle: bool = False, ) -> None: super().__init__(batch, minibatch_size, num_iters) self._batch = batch @@ -66,6 +67,8 @@ def __init__( self._mini_batch_count = 0 self._num_total_mini_batches = num_total_mini_batches + self._shuffle = shuffle + def __iter__(self): while ( # Make sure each item in the total batch gets at least iterated over @@ -83,6 +86,12 @@ def __iter__(self): minibatch = {} for module_id, module_batch in self._batch.policy_batches.items(): + # Shuffle the individual single-agent batch, if required. + # This should happen once per minibatch iteration in order to make + # each iteration go through a different set of minibatches. + if self._shuffle: + module_batch.shuffle() + if len(module_batch) == 0: raise ValueError( f"The batch for module_id {module_id} is empty! 
" diff --git a/rllib/utils/test_utils.py b/rllib/utils/test_utils.py index 8925024ee764..dd90ca5fedc6 100644 --- a/rllib/utils/test_utils.py +++ b/rllib/utils/test_utils.py @@ -1425,10 +1425,10 @@ def run_rllib_example_script_experiment( for i in range(stop.get(TRAINING_ITERATION, args.stop_iters)): results = algo.train() if ENV_RUNNER_RESULTS in results: - print( - f"iter={i} R={results[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN]}", - end="", + mean_return = results[ENV_RUNNER_RESULTS].get( + EPISODE_RETURN_MEAN, np.nan ) + print(f"iter={i} R={mean_return}", end="") if EVALUATION_RESULTS in results: Reval = results[EVALUATION_RESULTS][ENV_RUNNER_RESULTS][ EPISODE_RETURN_MEAN From 06ec0d1879f053a2db827cf2d8bb762239ab40a2 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Mon, 2 Sep 2024 10:10:09 +0200 Subject: [PATCH 02/20] wip Signed-off-by: sven1977 --- rllib/algorithms/algorithm_config.py | 26 ++++ rllib/algorithms/appo/appo.py | 19 ++- rllib/algorithms/appo/appo_learner.py | 6 +- rllib/algorithms/impala/impala.py | 89 +++++++------- rllib/algorithms/impala/impala_learner.py | 26 +++- rllib/algorithms/marwil/marwil.py | 2 +- rllib/algorithms/ppo/ppo.py | 77 +++--------- rllib/algorithms/ppo/tests/test_ppo.py | 6 +- .../ppo/tests/test_ppo_with_env_runner.py | 2 +- rllib/algorithms/ppo/tests/test_repro_ppo.py | 4 +- rllib/algorithms/tests/test_algorithm.py | 8 +- .../test_algorithm_checkpoint_restore.py | 2 +- .../tests/test_algorithm_rl_module_restore.py | 4 +- ..._algorithm_save_load_checkpoint_learner.py | 2 +- .../tests/test_callbacks_old_stack.py | 2 +- .../tests/test_callbacks_on_env_runner.py | 8 +- .../algorithms/tests/test_worker_failures.py | 10 +- .../run_ppo_with_inference_bm.py | 4 +- rllib/core/learner/learner.py | 112 +++++++++--------- rllib/core/learner/learner_group.py | 81 +++++-------- rllib/env/tests/test_multi_agent_env.py | 4 +- .../evaluation/tests/test_envs_that_crash.py | 2 +- rllib/evaluation/tests/test_rollout_worker.py | 2 +- .../tests/test_trajectory_view_api.py | 2 +- .../examples/actions/nested_action_spaces.py | 2 +- ...raining_step_on_and_off_policy_combined.py | 2 +- rllib/examples/cartpole_lstm.py | 2 +- .../examples/catalogs/mobilenet_v2_encoder.py | 2 +- .../restore_1_of_n_agents_from_checkpoint.py | 2 +- .../flatten_observations_dict_space.py | 2 +- rllib/examples/connectors/frame_stacking.py | 4 +- .../examples/connectors/mean_std_filtering.py | 2 +- .../connectors/prev_actions_prev_rewards.py | 2 +- .../curiosity/count_based_curiosity.py | 2 +- ...trinsic_curiosity_model_based_curiosity.py | 2 +- .../curriculum/curriculum_learning.py | 2 +- .../debugging/deterministic_training.py | 2 +- .../envs/env_rendering_and_recording.py | 2 +- rllib/examples/envs/greyscale_env.py | 4 +- rllib/examples/envs/unity3d_env_local.py | 4 +- .../gpus/float16_training_and_inference.py | 2 +- ...ed_precision_training_float16_inference.py | 2 +- ...cy_inference_after_training_w_connector.py | 2 +- .../learners/custom_loss_fn_simple.py | 2 +- .../learners/separate_vf_lr_and_optimizer.py | 2 +- .../learners/train_w_bc_finetune_w_ppo.py | 2 +- .../multi_agent/multi_agent_pendulum.py | 2 +- .../self_play_league_based_with_open_spiel.py | 2 +- .../multi_agent/self_play_with_open_spiel.py | 4 +- rllib/examples/multi_agent/two_algorithms.py | 2 +- rllib/examples/quadx_waypoints.py | 2 +- .../rl_modules/classes/lstm_containing_rlm.py | 2 +- .../rl_modules/classes/mobilenet_rlm.py | 2 +- .../rl_modules/classes/tiny_atari_cnn_rlm.py | 2 +- .../rl_modules/custom_lstm_rl_module.py | 2 +- 
rllib/execution/train_ops.py | 16 +-- rllib/models/tests/test_attention_nets.py | 6 +- rllib/models/tests/test_lstms.py | 4 +- rllib/models/tests/test_preprocessors.py | 6 +- rllib/policy/dynamic_tf_policy.py | 7 +- rllib/policy/dynamic_tf_policy_v2.py | 4 +- rllib/policy/torch_policy.py | 2 +- rllib/policy/torch_policy_v2.py | 3 +- .../checkpoints/create_checkpoints.py | 2 +- rllib/tests/test_io.py | 2 +- rllib/tests/test_lstm.py | 8 +- rllib/tests/test_nested_observation_spaces.py | 8 +- rllib/tests/test_supported_multi_agent.py | 2 +- rllib/tests/test_supported_spaces.py | 8 +- .../appo/cartpole-appo-separate-losses.py | 2 +- rllib/tuned_examples/appo/cartpole-appo.yaml | 2 +- rllib/tuned_examples/appo/cartpole_appo.py | 21 ++-- .../appo/frozenlake-appo-vtrace.yaml | 2 +- .../tuned_examples/appo/halfcheetah-appo.yaml | 2 +- ...ulti-agent-cartpole-w-100-policies-appo.py | 2 +- ...multi_agent_cartpole_appo_old_api_stack.py | 2 +- .../multi_agent_stateless_cartpole_appo.py | 7 +- rllib/tuned_examples/appo/pendulum-appo.yaml | 2 +- .../pong-appo-w-rl-modules-and-learner.yaml | 2 +- rllib/tuned_examples/appo/pong-appo.yaml | 2 +- .../appo/stateless_cartpole_appo.py | 2 +- rllib/tuned_examples/bc/cartpole_recording.py | 2 +- .../compact-regression-test.yaml | 8 +- .../impala/cartpole-impala-separate-losses.py | 2 +- ...lti_agent_cartpole_impala_old_api_stack.py | 2 +- rllib/tuned_examples/ppo/atari_ppo.py | 4 +- .../ppo/benchmark_ppo_mujoco.py | 4 +- .../ppo/benchmark_ppo_mujoco_pb2.py | 10 +- rllib/tuned_examples/ppo/cartpole-ppo.yaml | 2 +- rllib/tuned_examples/ppo/cartpole_ppo.py | 2 +- .../ppo/cartpole_truncated_ppo.py | 2 +- rllib/tuned_examples/ppo/halfcheetah-ppo.yaml | 4 +- rllib/tuned_examples/ppo/hopper-ppo.yaml | 4 +- .../tuned_examples/ppo/humanoid-ppo-gae.yaml | 4 +- rllib/tuned_examples/ppo/humanoid-ppo.yaml | 4 +- .../ppo/memory-leak-test-ppo.yaml | 4 +- .../ppo/memory_leak_test_ppo_new_stack.py | 2 +- .../ppo/multi_agent_cartpole_ppo.py | 2 +- .../ppo/multi_agent_pendulum_ppo.py | 2 +- .../ppo/multi_agent_stateless_cartpole_ppo.py | 2 +- rllib/tuned_examples/ppo/pendulum-ppo.yaml | 2 +- .../ppo/pendulum-transformed-actions-ppo.yaml | 4 +- rllib/tuned_examples/ppo/pendulum_ppo.py | 2 +- .../ppo/repeatafterme-ppo-lstm.yaml | 2 +- .../ppo/stateless_cartpole_ppo.py | 2 +- ...unity3d-soccer-strikers-vs-goalie-ppo.yaml | 4 +- rllib/tuned_examples/ppo/walker2d-ppo.yaml | 4 +- .../utils/exploration/tests/test_curiosity.py | 2 +- rllib/utils/minibatch_utils.py | 95 ++++++++------- rllib/utils/tests/test_minibatch_utils.py | 44 +++---- 110 files changed, 454 insertions(+), 469 deletions(-) diff --git a/rllib/algorithms/algorithm_config.py b/rllib/algorithms/algorithm_config.py index 5dd8de0bcc0f..0cedad5da905 100644 --- a/rllib/algorithms/algorithm_config.py +++ b/rllib/algorithms/algorithm_config.py @@ -382,6 +382,13 @@ def __init__(self, algo_class: Optional[type] = None): # Simple logic for now: If None, use `train_batch_size`. self.train_batch_size_per_learner = None self.train_batch_size = 32 # @OldAPIStack + + # These setting have been adopted from the original PPO batch settings: + # num_sgd_iter, minibatch_size, and shuffle_sequences. + self.num_epochs = 1 + self.shuffle_batch_per_epoch = False + self.minibatch_size = None + # TODO (sven): Unsolved problem with RLModules sometimes requiring settings from # the main AlgorithmConfig. 
We should not require the user to provide those # settings in both, the AlgorithmConfig (as property) AND the model config @@ -2047,6 +2054,9 @@ def training( grad_clip_by: Optional[str] = NotProvided, train_batch_size: Optional[int] = NotProvided, train_batch_size_per_learner: Optional[int] = NotProvided, + num_epochs: Optional[int] = NotProvided, + shuffle_batch_per_epoch: Optional[bool] = NotProvided, + minibatch_size: Optional[int] = NotProvided, model: Optional[dict] = NotProvided, optimizer: Optional[dict] = NotProvided, max_requests_in_flight_per_sampler_worker: Optional[int] = NotProvided, @@ -2105,6 +2115,15 @@ def training( stack, this setting should no longer be used. Instead, use `train_batch_size_per_learner` (in combination with `num_learners`). + num_epochs: The number of complete passes over the entire train batch (per + Learner). Each pass might be further split into n minibatches (if + `minibatch_size` provided). + shuffle_batch_per_epoch: Whether to shuffle the train batch once per epoch. + If the train batch has a time rank (axis=1), shuffling will only take + place along the batch axis to not disturb any intact (episode) + trajectories. + minibatch_size: The size of minibatches to use to further split the train + batch into. model: Arguments passed into the policy model. See models/catalog.py for a full list of the available model options. TODO: Provide ModelConfig objects instead of dicts. @@ -2168,6 +2187,13 @@ def training( self.train_batch_size_per_learner = train_batch_size_per_learner if train_batch_size is not NotProvided: self.train_batch_size = train_batch_size + if num_epochs is not NotProvided: + self.num_epochs = num_epochs + if shuffle_batch_per_epoch is not NotProvided: + self.shuffle_batch_per_epoch = shuffle_batch_per_epoch + if minibatch_size is not NotProvided: + self.minibatch_size = minibatch_size + if model is not NotProvided: self.model.update(model) if ( diff --git a/rllib/algorithms/appo/appo.py b/rllib/algorithms/appo/appo.py index 570e40087f98..73ceef6f3264 100644 --- a/rllib/algorithms/appo/appo.py +++ b/rllib/algorithms/appo/appo.py @@ -102,18 +102,11 @@ def __init__(self, algo_class=None): # Override some of IMPALAConfig's default values with APPO-specific values. self.num_env_runners = 2 self.min_time_s_per_iteration = 10 - self.num_gpus = 0 - self.num_multi_gpu_tower_stacks = 1 - self.minibatch_buffer_size = 1 - self.num_sgd_iter = 1 self.target_network_update_freq = 1 - self.replay_proportion = 0.0 - self.replay_buffer_num_slots = 100 self.learner_queue_size = 16 self.learner_queue_timeout = 300 self.max_sample_requests_in_flight_per_worker = 2 self.broadcast_interval = 1 - self.grad_clip = 40.0 # Note: Only when using enable_rl_module_and_learner=True can the clipping mode # be configured by the user. On the old API stack, RLlib will always clip by @@ -140,6 +133,12 @@ def __init__(self, algo_class=None): # Add constructor kwargs here (if any). } + self.num_gpus = 0 # @OldAPIStack + self.num_multi_gpu_tower_stacks = 1 # @OldAPIStack + self.minibatch_buffer_size = 1 # @OldAPIStack + self.replay_proportion = 0.0 # @OldAPIStack + self.replay_buffer_num_slots = 100 # @OldAPIStack + # __sphinx_doc_end__ # fmt: on @@ -185,7 +184,7 @@ def training( target_network_update_freq: The frequency to update the target policy and tune the kl loss coefficients that are used during training. 
After setting this parameter, the algorithm waits for at least - `target_network_update_freq * minibatch_size * num_sgd_iter` number of + `target_network_update_freq * minibatch_size * num_epochs` number of samples to be trained on by the learner group before updating the target networks and tuned the kl loss coefficients that are used during training. @@ -292,7 +291,7 @@ def training_step(self) -> ResultDict: # Update the target network and the KL coefficient for the APPO-loss. # The target network update frequency is calculated automatically by the product - # of `num_sgd_iter` setting (usually 1 for APPO) and `minibatch_buffer_size`. + # of `num_epochs` setting (usually 1 for APPO) and `minibatch_buffer_size`. if self.config.enable_rl_module_and_learner: if NUM_TARGET_UPDATES in train_results: self._counters[NUM_TARGET_UPDATES] += train_results[NUM_TARGET_UPDATES] @@ -309,7 +308,7 @@ def training_step(self) -> ResultDict: ) ] target_update_freq = ( - self.config.num_sgd_iter * self.config.minibatch_buffer_size + self.config.num_epochs * self.config.minibatch_buffer_size ) if cur_ts - last_update > target_update_freq: self._counters[NUM_TARGET_UPDATES] += 1 diff --git a/rllib/algorithms/appo/appo_learner.py b/rllib/algorithms/appo/appo_learner.py index a1c06a854309..ff67637f4257 100644 --- a/rllib/algorithms/appo/appo_learner.py +++ b/rllib/algorithms/appo/appo_learner.py @@ -90,14 +90,14 @@ def after_gradient_based_update(self, *, timesteps: Dict[str, Any]) -> None: # TODO (avnish) Using steps trained here instead of sampled ... I'm not sure # why the other implementation uses sampled. # The difference in steps sampled/trained is pretty - # much always going to be larger than self.config.num_sgd_iter * + # much always going to be larger than self.config.num_epochs * # self.config.minibatch_buffer_size unless the number of steps collected # is really small. The thing is that the default rollout fragment length - # is 50, so the minibatch buffer size * num_sgd_iter is going to be + # is 50, so the minibatch buffer size * num_epochs is going to be # have to be 50 to even meet the threshold of having delayed target # updates. # We should instead have the target / kl threshold update be based off - # of the train_batch_size * some target update frequency * num_sgd_iter. + # of the train_batch_size * some target update frequency * num_epochs. 
last_update_ts_key = (module_id, LAST_TARGET_UPDATE_TS) if timestep - self.metrics.peek( diff --git a/rllib/algorithms/impala/impala.py b/rllib/algorithms/impala/impala.py index a06b9280dbf1..9ad590f72f34 100644 --- a/rllib/algorithms/impala/impala.py +++ b/rllib/algorithms/impala/impala.py @@ -134,7 +134,6 @@ def __init__(self, algo_class=None): self.vtrace_clip_pg_rho_threshold = 1.0 self.num_multi_gpu_tower_stacks = 1 # @OldAPIstack self.minibatch_buffer_size = 1 # @OldAPIstack - self.num_sgd_iter = 1 self.replay_proportion = 0.0 # @OldAPIstack self.replay_buffer_num_slots = 0 # @OldAPIstack self.learner_queue_size = 3 @@ -171,7 +170,7 @@ def __init__(self, algo_class=None): self.rollout_fragment_length = 50 self.train_batch_size = 500 # @OldAPIstack self.train_batch_size_per_learner = 500 - self._minibatch_size = "auto" + #self._minibatch_size = "auto" self.num_env_runners = 2 self.num_gpus = 1 # @OldAPIstack self.lr = 0.0005 @@ -200,8 +199,6 @@ def training( num_gpu_loader_threads: Optional[int] = NotProvided, num_multi_gpu_tower_stacks: Optional[int] = NotProvided, minibatch_buffer_size: Optional[int] = NotProvided, - minibatch_size: Optional[Union[int, str]] = NotProvided, - num_sgd_iter: Optional[int] = NotProvided, replay_proportion: Optional[float] = NotProvided, replay_buffer_num_slots: Optional[int] = NotProvided, learner_queue_size: Optional[int] = NotProvided, @@ -252,15 +249,7 @@ def training( - This enables us to preload data into these stacks while another stack is performing gradient calculations. minibatch_buffer_size: How many train batches should be retained for - minibatching. This conf only has an effect if `num_sgd_iter > 1`. - minibatch_size: The size of minibatches that are trained over during - each SGD iteration. If "auto", will use the same value as - `train_batch_size`. - Note that this setting only has an effect if - `enable_rl_module_and_learner=True` and it must be a multiple of - `rollout_fragment_length` or `sequence_length` and smaller than or equal - to `train_batch_size`. - num_sgd_iter: Number of passes to make over each train batch. + minibatching. This conf only has an effect if `num_epochs > 1`. replay_proportion: Set >0 to enable experience replay. Saved samples will be replayed with a p:1 proportion to new data samples. replay_buffer_num_slots: Number of sample batches to store for replay. @@ -330,8 +319,6 @@ def training( self.num_multi_gpu_tower_stacks = num_multi_gpu_tower_stacks if minibatch_buffer_size is not NotProvided: self.minibatch_buffer_size = minibatch_buffer_size - if num_sgd_iter is not NotProvided: - self.num_sgd_iter = num_sgd_iter if replay_proportion is not NotProvided: self.replay_proportion = replay_proportion if replay_buffer_num_slots is not NotProvided: @@ -374,8 +361,6 @@ def training( self._separate_vf_optimizer = _separate_vf_optimizer if _lr_vf is not NotProvided: self._lr_vf = _lr_vf - if minibatch_size is not NotProvided: - self._minibatch_size = minibatch_size return self @@ -450,21 +435,21 @@ def validate(self) -> None: "config.training(_tf_policy_handles_more_than_one_loss=True)." ) # Learner API specific checks. 
- if ( - self.enable_rl_module_and_learner - and self._minibatch_size != "auto" - and not ( - (self.minibatch_size % self.rollout_fragment_length == 0) - and self.minibatch_size <= self.total_train_batch_size - ) - ): - raise ValueError( - f"`minibatch_size` ({self._minibatch_size}) must either be 'auto' " - "or a multiple of `rollout_fragment_length` " - f"({self.rollout_fragment_length}) while at the same time smaller " - "than or equal to `total_train_batch_size` " - f"({self.total_train_batch_size})!" - ) + #if ( + # self.enable_rl_module_and_learner + # and self._minibatch_size != "auto" + # and not ( + # (self.minibatch_size % self.rollout_fragment_length == 0) + # and self.minibatch_size <= self.total_train_batch_size + # ) + #): + # raise ValueError( + # f"`minibatch_size` ({self._minibatch_size}) must either be 'auto' " + # "or a multiple of `rollout_fragment_length` " + # f"({self.rollout_fragment_length}) while at the same time smaller " + # "than or equal to `total_train_batch_size` " + # f"({self.total_train_batch_size})!" + # ) @property def replay_ratio(self) -> float: @@ -474,19 +459,19 @@ def replay_ratio(self) -> float: """ return (1 / self.replay_proportion) if self.replay_proportion > 0 else 0.0 - @property - def minibatch_size(self): - # If 'auto', use the train_batch_size (meaning each SGD iter is a single pass - # through the entire train batch). Otherwise, use user provided setting. - return ( - ( - self.train_batch_size_per_learner - if self.enable_env_runner_and_connector_v2 - else self.train_batch_size - ) - if self._minibatch_size == "auto" - else self._minibatch_size - ) + #@property + #def minibatch_size(self): + # # If 'auto', use the train_batch_size (meaning each SGD iter is a single pass + # # through the entire train batch). Otherwise, use user provided setting. + # return ( + # ( + # self.train_batch_size_per_learner + # if self.enable_env_runner_and_connector_v2 + # else self.train_batch_size + # ) + # if self._minibatch_size == "auto" + # else self._minibatch_size + # ) @override(AlgorithmConfig) def get_default_learner_class(self): @@ -539,7 +524,7 @@ class IMPALA(Algorithm): 2. If enabled, the replay buffer stores and produces batches of size `rollout_fragment_length * num_envs_per_env_runner`. 3. If enabled, the minibatch ring buffer stores and replays batches of - size `train_batch_size` up to `num_sgd_iter` times per batch. + size `train_batch_size` up to `num_epochs` times per batch. 4. The learner thread executes data parallel SGD across `num_gpus` GPUs on batches of size `train_batch_size`. 
""" @@ -734,6 +719,9 @@ def training_step(self) -> ResultDict: NUM_ENV_STEPS_SAMPLED_LIFETIME, default=0 ), }, + num_epochs=self.config.num_epochs, + minibatch_size=self.config.minibatch_size, + shuffle_batch_per_epoch=self.config.shuffle_batch_per_epoch, ) else: learner_results = self.learner_group.update_from_episodes( @@ -745,6 +733,9 @@ def training_step(self) -> ResultDict: NUM_ENV_STEPS_SAMPLED_LIFETIME, default=0 ), }, + num_epochs=self.config.num_epochs, + minibatch_size=self.config.minibatch_size, + shuffle_batch_per_epoch=self.config.shuffle_batch_per_epoch, ) if not do_async_updates: learner_results = [learner_results] @@ -1292,7 +1283,7 @@ def _learn_on_processed_samples(self) -> ResultDict: ), }, async_update=async_update, - num_iters=self.config.num_sgd_iter, + num_epochs=self.config.num_epochs, minibatch_size=self.config.minibatch_size, ) if not async_update: @@ -1531,7 +1522,7 @@ def make_learner_thread(local_worker, config): lr=config["lr"], train_batch_size=config["train_batch_size"], num_multi_gpu_tower_stacks=config["num_multi_gpu_tower_stacks"], - num_sgd_iter=config["num_sgd_iter"], + num_sgd_iter=config["num_epochs"], learner_queue_size=config["learner_queue_size"], learner_queue_timeout=config["learner_queue_timeout"], num_data_load_threads=config["num_gpu_loader_threads"], @@ -1540,7 +1531,7 @@ def make_learner_thread(local_worker, config): learner_thread = LearnerThread( local_worker, minibatch_buffer_size=config["minibatch_buffer_size"], - num_sgd_iter=config["num_sgd_iter"], + num_sgd_iter=config["num_epochs"], learner_queue_size=config["learner_queue_size"], learner_queue_timeout=config["learner_queue_timeout"], ) diff --git a/rllib/algorithms/impala/impala_learner.py b/rllib/algorithms/impala/impala_learner.py index 651515666f89..f6f6df0cdb1e 100644 --- a/rllib/algorithms/impala/impala_learner.py +++ b/rllib/algorithms/impala/impala_learner.py @@ -93,6 +93,9 @@ def build(self) -> None: in_queue=self._learner_thread_in_queue, out_queue=self._learner_thread_out_queue, metrics_logger=self.metrics, + num_epochs=self.config.num_epochs, + minibatch_size=self.config.minibatch_size, + shuffle_batch_per_epoch=self.config.shuffle_batch_per_epoch, ) self._learner_thread.start() @@ -105,8 +108,8 @@ def update_from_episodes( # TODO (sven): Deprecate these in favor of config attributes for only those # algos that actually need (and know how) to do minibatching. minibatch_size: Optional[int] = None, - num_iters: int = 1, - num_total_mini_batches: int = 0, + num_epochs: int = 1, + num_total_minibatches: int = 0, reduce_fn=None, # Deprecated args. 
**kwargs, ) -> ResultDict: @@ -225,7 +228,17 @@ def _step(self) -> None: class _LearnerThread(threading.Thread): - def __init__(self, *, update_method, in_queue, out_queue, metrics_logger): + def __init__( + self, + *, + update_method, + in_queue, + out_queue, + metrics_logger, + num_epochs, + minibatch_size, + shuffle_batch_per_epoch, + ): super().__init__() self.daemon = True self.metrics: MetricsLogger = metrics_logger @@ -235,6 +248,10 @@ def __init__(self, *, update_method, in_queue, out_queue, metrics_logger): self._in_queue: deque = in_queue self._out_queue: Queue = out_queue + self._num_epochs = num_epochs + self._minibatch_size = minibatch_size + self._shuffle_batch_per_epoch = shuffle_batch_per_epoch + def run(self) -> None: while not self.stopped: self.step() @@ -260,6 +277,9 @@ def step(self): NUM_ENV_STEPS_SAMPLED_LIFETIME, default=0 ) }, + num_epochs=self._num_epochs, + minibatch_size=self._minibatch_size, + shuffle_batch_per_epoch=self._shuffle_batch_per_epoch, ) # We have to deepcopy the results dict, b/c we must avoid having a returned # Stats object sit in the queue and getting a new (possibly even tensor) diff --git a/rllib/algorithms/marwil/marwil.py b/rllib/algorithms/marwil/marwil.py index de9965de8d7d..7dbe8c85566f 100644 --- a/rllib/algorithms/marwil/marwil.py +++ b/rllib/algorithms/marwil/marwil.py @@ -398,7 +398,7 @@ class (multi-/single-learner setup) and evaluation on learner_results = self.learner_group.update_from_batch( batch, minibatch_size=self.config.train_batch_size_per_learner, - num_iters=self.config.dataset_num_iters_per_learner, + num_epochs=self.config.dataset_num_iters_per_learner, ) # Log training results. diff --git a/rllib/algorithms/ppo/ppo.py b/rllib/algorithms/ppo/ppo.py index 2fa7c2d0d589..a627b8df1d1d 100644 --- a/rllib/algorithms/ppo/ppo.py +++ b/rllib/algorithms/ppo/ppo.py @@ -130,6 +130,7 @@ def __init__(self, algo_class=None): self.lr = 5e-5 self.rollout_fragment_length = "auto" self.train_batch_size = 4000 + self.shuffle_batch_per_epoch = True # PPO specific settings: self.use_critic = True @@ -138,11 +139,8 @@ def __init__(self, algo_class=None): self.use_kl_loss = True self.kl_coeff = 0.2 self.kl_target = 0.01 - self.sgd_minibatch_size = 128 - # Simple logic for now: If None, use `train_batch_size`. - self.mini_batch_size_per_learner = None - self.num_sgd_iter = 30 - self.shuffle_single_agent_batch = True + self.minibatch_size = 128 + self.num_epochs = 30 self.vf_loss_coeff = 1.0 self.entropy_coeff = 0.0 self.entropy_coeff_schedule = None @@ -157,6 +155,7 @@ def __init__(self, algo_class=None): # fmt: on # Deprecated keys. + self.sgd_minibatch_size = DEPRECATED_VALUE self.vf_share_layers = DEPRECATED_VALUE self.exploration_config = { @@ -217,10 +216,6 @@ def training( use_kl_loss: Optional[bool] = NotProvided, kl_coeff: Optional[float] = NotProvided, kl_target: Optional[float] = NotProvided, - mini_batch_size_per_learner: Optional[int] = NotProvided, - sgd_minibatch_size: Optional[int] = NotProvided, - num_sgd_iter: Optional[int] = NotProvided, - shuffle_single_agent_batch: Optional[bool] = NotProvided, vf_loss_coeff: Optional[float] = NotProvided, entropy_coeff: Optional[float] = NotProvided, entropy_coeff_schedule: Optional[List[List[Union[int, float]]]] = NotProvided, @@ -244,29 +239,6 @@ def training( use_kl_loss: Whether to use the KL-term in the loss function. kl_coeff: Initial coefficient for KL divergence. kl_target: Target value for KL divergence. - mini_batch_size_per_learner: Only use if new API stack is enabled. 
- The mini batch size per Learner worker. This is the - batch size that each Learner worker's training batch (whose size is - `s`elf.train_batch_size_per_learner`) will be split into. For example, - if the train batch size per Learner worker is 4000 and the mini batch - size per Learner worker is 400, the train batch will be split into 10 - equal sized chunks (or "mini batches"). Each such mini batch will be - used for one SGD update. Overall, the train batch on each Learner - worker will be traversed `self.num_sgd_iter` times. In the above - example, if `self.num_sgd_iter` is 5, we will altogether perform 50 - (10x5) SGD updates per Learner update step. - sgd_minibatch_size: Total SGD batch size across all devices for SGD. - This defines the minibatch size within each epoch. Deprecated on the - new API stack (use `mini_batch_size_per_learner` instead). - num_sgd_iter: Number of SGD iterations in each outer loop (i.e., number of - epochs to execute per train batch). - shuffle_single_agent_batch: Whether to shuffle each single-agent batch once - before a new epoch (which consists of n x minibatches, where n is - `batch_size_per_learner` // `mini_batch_size_per_learner`). This should - be set to True in single-agent and independent multi-agent cases as it - ensures proper mixing of the samples before each batch epoch. Otherwise, - the sequence of minibatches iterated through is the same in each - iteration, possibly impacting learning. vf_loss_coeff: Coefficient of the value function loss. IMPORTANT: you must tune this if you set vf_share_layers=True inside your model's config. entropy_coeff: The entropy coefficient (float) or entropy coefficient @@ -301,14 +273,6 @@ def training( self.kl_coeff = kl_coeff if kl_target is not NotProvided: self.kl_target = kl_target - if mini_batch_size_per_learner is not NotProvided: - self.mini_batch_size_per_learner = mini_batch_size_per_learner - if sgd_minibatch_size is not NotProvided: - self.sgd_minibatch_size = sgd_minibatch_size - if num_sgd_iter is not NotProvided: - self.num_sgd_iter = num_sgd_iter - if shuffle_single_agent_batch is not NotProvided: - self.shuffle_single_agent_batch = shuffle_single_agent_batch if vf_loss_coeff is not NotProvided: self.vf_loss_coeff = vf_loss_coeff if entropy_coeff is not NotProvided: @@ -342,28 +306,28 @@ def validate(self) -> None: self.validate_train_batch_size_vs_rollout_fragment_length() # SGD minibatch size must be smaller than train_batch_size (b/c - # we subsample a batch of `sgd_minibatch_size` from the train-batch for - # each `num_sgd_iter`). + # we subsample a batch of `minibatch_size` from the train-batch for + # each `num_epochs`). if ( not self.enable_rl_module_and_learner - and self.sgd_minibatch_size > self.train_batch_size + and self.minibatch_size > self.train_batch_size ): raise ValueError( - f"`sgd_minibatch_size` ({self.sgd_minibatch_size}) must be <= " + f"`minibatch_size` ({self.minibatch_size}) must be <= " f"`train_batch_size` ({self.train_batch_size}). In PPO, the train batch" - f" will be split into {self.sgd_minibatch_size} chunks, each of which " - f"is iterated over (used for updating the policy) {self.num_sgd_iter} " + f" will be split into {self.minibatch_size} chunks, each of which " + f"is iterated over (used for updating the policy) {self.num_epochs} " "times." 
) elif self.enable_rl_module_and_learner: - mbs = self.mini_batch_size_per_learner or self.sgd_minibatch_size + mbs = self.minibatch_size tbs = self.train_batch_size_per_learner or self.train_batch_size if isinstance(mbs, int) and isinstance(tbs, int) and mbs > tbs: raise ValueError( - f"`mini_batch_size_per_learner` ({mbs}) must be <= " + f"`minibatch_size` ({mbs}) must be <= " f"`train_batch_size_per_learner` ({tbs}). In PPO, the train batch" f" will be split into {mbs} chunks, each of which is iterated over " - f"(used for updating the policy) {self.num_sgd_iter} times." + f"(used for updating the policy) {self.num_epochs} times." ) # Episodes may only be truncated (and passed into PPO's @@ -495,11 +459,8 @@ def _training_step_new_api_stack(self) -> ResultDict: self.metrics.peek(NUM_ENV_STEPS_SAMPLED_LIFETIME) ), }, - minibatch_size=( - self.config.mini_batch_size_per_learner - or self.config.sgd_minibatch_size - ), - num_iters=self.config.num_sgd_iter, + minibatch_size=self.config.minibatch_size, + num_epochs=self.config.num_epochs, ) self.metrics.merge_and_log_n_dicts(learner_results, key=LEARNER_RESULTS) self.metrics.log_dict( @@ -565,14 +526,10 @@ def _training_step_old_and_hybrid_api_stacks(self) -> ResultDict: # Perform a train step on the collected batch. if self.config.enable_rl_module_and_learner: - mini_batch_size_per_learner = ( - self.config.mini_batch_size_per_learner - or self.config.sgd_minibatch_size - ) train_results = self.learner_group.update_from_batch( batch=train_batch, - minibatch_size=mini_batch_size_per_learner, - num_iters=self.config.num_sgd_iter, + minibatch_size=self.config.minibatch_size, + num_epochs=self.config.num_epochs, ) elif self.config.simple_optimizer: diff --git a/rllib/algorithms/ppo/tests/test_ppo.py b/rllib/algorithms/ppo/tests/test_ppo.py index c99bc9c8feac..981473e1432b 100644 --- a/rllib/algorithms/ppo/tests/test_ppo.py +++ b/rllib/algorithms/ppo/tests/test_ppo.py @@ -126,7 +126,7 @@ def test_ppo_compilation_w_connectors(self): config = ( ppo.PPOConfig() .training( - num_sgd_iter=2, + num_epochs=2, # Setup lr schedule for testing. lr_schedule=[[0, 5e-5], [128, 0.0]], # Set entropy_coeff to a faulty value to proof that it'll get @@ -199,8 +199,8 @@ def test_ppo_compilation_and_schedule_mixins(self): entropy_coeff=100.0, entropy_coeff_schedule=[[0, 0.1], [512, 0.0]], train_batch_size=256, - sgd_minibatch_size=128, - num_sgd_iter=2, + minibatch_size=128, + num_epochs=2, model=dict( # Settings in case we use an LSTM. lstm_cell_size=10, diff --git a/rllib/algorithms/ppo/tests/test_ppo_with_env_runner.py b/rllib/algorithms/ppo/tests/test_ppo_with_env_runner.py index 1794c24bb5ba..5166ceb2d34a 100644 --- a/rllib/algorithms/ppo/tests/test_ppo_with_env_runner.py +++ b/rllib/algorithms/ppo/tests/test_ppo_with_env_runner.py @@ -73,7 +73,7 @@ def test_ppo_compilation_and_schedule_mixins(self): ) .env_runners(num_env_runners=0) .training( - num_sgd_iter=2, + num_epochs=2, # Setup lr schedule for testing lr-scheduling correctness. 
lr=[[0, 0.00001], [512, 0.0]], # 512=4x128 # Setup `entropy_coeff` schedule for testing whether it's scheduled diff --git a/rllib/algorithms/ppo/tests/test_repro_ppo.py b/rllib/algorithms/ppo/tests/test_repro_ppo.py index 7d0fdcfaef2f..50dcd7912d5a 100644 --- a/rllib/algorithms/ppo/tests/test_repro_ppo.py +++ b/rllib/algorithms/ppo/tests/test_repro_ppo.py @@ -29,7 +29,7 @@ def test_reproducibility_ppo_cartpole(self): ppo.PPOConfig() .environment(env="DeterministicCartPole-v1", env_config={"seed": 42}) .env_runners(rollout_fragment_length=8) - .training(train_batch_size=64, sgd_minibatch_size=32, num_sgd_iter=2) + .training(train_batch_size=64, minibatch_size=32, num_epochs=2) ) check_reproducibilty( algo_class=ppo.PPO, @@ -47,7 +47,7 @@ def test_reproducibility_ppo_pendulum(self): ppo.PPOConfig() .environment(env="DeterministicPendulum-v1", env_config={"seed": 42}) .env_runners(rollout_fragment_length=8) - .training(train_batch_size=64, sgd_minibatch_size=32, num_sgd_iter=2) + .training(train_batch_size=64, minibatch_size=32, num_epochs=2) ) check_reproducibilty( algo_class=ppo.PPO, diff --git a/rllib/algorithms/tests/test_algorithm.py b/rllib/algorithms/tests/test_algorithm.py index 97b1cda0c9fe..12c98ce50f60 100644 --- a/rllib/algorithms/tests/test_algorithm.py +++ b/rllib/algorithms/tests/test_algorithm.py @@ -54,8 +54,8 @@ def test_add_module_and_remove_module(self): .env_runners(num_cpus_per_env_runner=0.1) .training( train_batch_size=100, - sgd_minibatch_size=50, - num_sgd_iter=1, + minibatch_size=50, + num_epochs=1, ) .rl_module( model_config_dict={ @@ -224,8 +224,8 @@ def test_add_policy_and_remove_policy(self): .env_runners(num_cpus_per_env_runner=0.1) .training( train_batch_size=100, - sgd_minibatch_size=50, - num_sgd_iter=1, + minibatch_size=50, + num_epochs=1, model={ "fcnet_hiddens": [5], "fcnet_activation": "linear", diff --git a/rllib/algorithms/tests/test_algorithm_checkpoint_restore.py b/rllib/algorithms/tests/test_algorithm_checkpoint_restore.py index f88b54347a84..b4c2a7b1b6ce 100644 --- a/rllib/algorithms/tests/test_algorithm_checkpoint_restore.py +++ b/rllib/algorithms/tests/test_algorithm_checkpoint_restore.py @@ -27,7 +27,7 @@ # See the comment before the `algorithms_and_configs` dict. # explore is set to None for PPO in favor of RLModule API support. 
PPOConfig() - .training(num_sgd_iter=5, train_batch_size=1000) + .training(num_epochs=5, train_batch_size=1000) .env_runners(num_env_runners=2) .resources(num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0"))) .evaluation( diff --git a/rllib/algorithms/tests/test_algorithm_rl_module_restore.py b/rllib/algorithms/tests/test_algorithm_rl_module_restore.py index 1dd50fb84035..7b44191ce0c3 100644 --- a/rllib/algorithms/tests/test_algorithm_rl_module_restore.py +++ b/rllib/algorithms/tests/test_algorithm_rl_module_restore.py @@ -54,7 +54,7 @@ def policy_mapping_fn(agent_id, episode, worker, **kwargs): .env_runners(rollout_fragment_length=4) .learners(**scaling_config) .environment(MultiAgentCartPole, env_config={"num_agents": num_agents}) - .training(num_sgd_iter=1, train_batch_size=8, sgd_minibatch_size=8) + .training(num_epochs=1, train_batch_size=8, minibatch_size=8) .multi_agent(policies=policies, policy_mapping_fn=policy_mapping_fn) ) return config @@ -190,7 +190,7 @@ def test_e2e_load_rl_module(self): .env_runners(rollout_fragment_length=4) .learners(**scaling_config) .environment("CartPole-v1") - .training(num_sgd_iter=1, train_batch_size=8, sgd_minibatch_size=8) + .training(num_epochs=1, train_batch_size=8, minibatch_size=8) ) env = gym.make("CartPole-v1") # create a multi_rl_module to load and save it to a checkpoint directory diff --git a/rllib/algorithms/tests/test_algorithm_save_load_checkpoint_learner.py b/rllib/algorithms/tests/test_algorithm_save_load_checkpoint_learner.py index 3b71c09528bf..19683a89876d 100644 --- a/rllib/algorithms/tests/test_algorithm_save_load_checkpoint_learner.py +++ b/rllib/algorithms/tests/test_algorithm_save_load_checkpoint_learner.py @@ -10,7 +10,7 @@ algorithms_and_configs = { - "PPO": (PPOConfig().training(train_batch_size=2, sgd_minibatch_size=2)) + "PPO": (PPOConfig().training(train_batch_size=2, minibatch_size=2)) } diff --git a/rllib/algorithms/tests/test_callbacks_old_stack.py b/rllib/algorithms/tests/test_callbacks_old_stack.py index dcbe2e516733..feef340c41ca 100644 --- a/rllib/algorithms/tests/test_callbacks_old_stack.py +++ b/rllib/algorithms/tests/test_callbacks_old_stack.py @@ -79,7 +79,7 @@ def test_episode_and_sample_callbacks(self): .environment("CartPole-v1") .env_runners(num_env_runners=0) .callbacks(EpisodeAndSampleCallbacks) - .training(train_batch_size=50, sgd_minibatch_size=50, num_sgd_iter=1) + .training(train_batch_size=50, minibatch_size=50, num_epochs=1) ) algo = config.build() algo.train() diff --git a/rllib/algorithms/tests/test_callbacks_on_env_runner.py b/rllib/algorithms/tests/test_callbacks_on_env_runner.py index 2b0ca696edf6..42abf7091841 100644 --- a/rllib/algorithms/tests/test_callbacks_on_env_runner.py +++ b/rllib/algorithms/tests/test_callbacks_on_env_runner.py @@ -106,8 +106,8 @@ def test_episode_and_sample_callbacks_batch_mode_truncate_episodes(self): .callbacks(EpisodeAndSampleCallbacks) .training( train_batch_size=50, # <- rollout_fragment_length=50 - sgd_minibatch_size=50, - num_sgd_iter=1, + minibatch_size=50, + num_epochs=1, ) ) @@ -158,8 +158,8 @@ def test_episode_and_sample_callbacks_batch_mode_complete_episodes(self): .callbacks(EpisodeAndSampleCallbacks) .training( train_batch_size=50, # <- rollout_fragment_length=50 - sgd_minibatch_size=50, - num_sgd_iter=1, + minibatch_size=50, + num_epochs=1, ) ) diff --git a/rllib/algorithms/tests/test_worker_failures.py b/rllib/algorithms/tests/test_worker_failures.py index 8ae1a2d69102..8e603694a158 100644 --- a/rllib/algorithms/tests/test_worker_failures.py +++ 
b/rllib/algorithms/tests/test_worker_failures.py @@ -452,8 +452,8 @@ def test_multi_gpu(self): .env_runners(env_runner_cls=ForwardHealthCheckToEnvWorker) .training( train_batch_size=10, - sgd_minibatch_size=1, - num_sgd_iter=1, + minibatch_size=1, + num_epochs=1, ) ) @@ -561,7 +561,7 @@ def test_workers_failing_recover(self): ) .training( train_batch_size_per_learner=32, - sgd_minibatch_size=32, + minibatch_size=32, ) .environment( env="fault_env", @@ -620,7 +620,7 @@ def test_modules_are_restored_on_recovered_worker(self): ) .training( train_batch_size_per_learner=32, - sgd_minibatch_size=32, + minibatch_size=32, ) .environment( env="multi_agent_fault_env", @@ -729,7 +729,7 @@ def test_eval_workers_failing_recover(self): ) .training( train_batch_size_per_learner=32, - sgd_minibatch_size=32, + minibatch_size=32, ) .environment(env="fault_env") .evaluation( diff --git a/rllib/benchmarks/torch_compile/run_ppo_with_inference_bm.py b/rllib/benchmarks/torch_compile/run_ppo_with_inference_bm.py index a941f66deff1..fa046b05285d 100644 --- a/rllib/benchmarks/torch_compile/run_ppo_with_inference_bm.py +++ b/rllib/benchmarks/torch_compile/run_ppo_with_inference_bm.py @@ -43,8 +43,8 @@ def main(pargs): vf_clip_param=10.0, entropy_coeff=0.01, train_batch_size=32 if pargs.smoke_test else 16000, - sgd_minibatch_size=1 if pargs.smoke_test else 2000, - num_sgd_iter=1 if pargs.smoke_test else 10, + minibatch_size=1 if pargs.smoke_test else 2000, + num_epochs=1 if pargs.smoke_test else 10, vf_loss_coeff=0.01, clip_param=0.1, lr=0.0001, diff --git a/rllib/core/learner/learner.py b/rllib/core/learner/learner.py index 98b15a5cab18..14590f4b9e94 100644 --- a/rllib/core/learner/learner.py +++ b/rllib/core/learner/learner.py @@ -866,7 +866,7 @@ def compute_losses( fwd_out: Output from a call to the `forward_train()` method of the underlying MultiRLModule (`self.module`) during training (`self.update()`). - batch: The training batch that was used to compute `fwd_out`. + batch: The train batch that was used to compute `fwd_out`. Returns: A dictionary mapping module IDs to individual loss terms. @@ -905,7 +905,7 @@ def compute_loss_for_module( Args: module_id: The id of the module. config: The AlgorithmConfig specific to the given `module_id`. - batch: The sample batch for this particular module. + batch: The train batch for this particular module. fwd_out: The output of the forward pass for this particular module. Returns: @@ -925,17 +925,15 @@ def update_from_batch( *, # TODO (sven): Make this a more formal structure with its own type. timesteps: Optional[Dict[str, Any]] = None, - # TODO (sven): Deprecate these in favor of config attributes for only those - # algos that actually need (and know how) to do minibatching. + num_epochs: int = 1, minibatch_size: Optional[int] = None, - num_iters: int = 1, # Deprecated args. - reduce_fn=DEPRECATED_VALUE, + num_iters=DEPRECATED_VALUE, ) -> ResultDict: - """Do `num_iters` minibatch updates given a train batch. + """Run `num_epochs` epochs over the given train batch. You can use this method to take more than one backward pass on the batch. - The same `minibatch_size` and `num_iters` will be used for all module ids in + The same `minibatch_size` and `num_epochs` will be used for all module ids in MultiRLModule. Args: @@ -943,9 +941,12 @@ def update_from_batch( timesteps: Timesteps dict, which must have the key `NUM_ENV_STEPS_SAMPLED_LIFETIME`. # TODO (sven): Make this a more formal structure with its own type. 
- minibatch_size: The size of the minibatch to use for each update. - num_iters: The number of complete passes over all the sub-batches - in the input multi-agent batch. + num_epochs: The number of complete passes over the entire train batch. Each + pass might be further split into n minibatches (if `minibatch_size` + provided). The train batch is generated from the given `episodes` + through the Learner connector pipeline. + minibatch_size: The size of minibatches to use to further split the train + batch into. Returns: A `ResultDict` object produced by a call to `self.metrics.reduce()`. The @@ -954,21 +955,17 @@ def update_from_batch( Learner) to further reduce these results (for example over n parallel Learners). """ - if reduce_fn != DEPRECATED_VALUE: + if num_iters != DEPRECATED_VALUE: deprecation_warning( - old="Learner.update_from_batch(reduce_fn=..)", - new="Learner.metrics.[log_value|log_dict|log_time](key=..., value=..., " - "reduce=[mean|min|max|sum], window=..., ema_coeff=...)", - help="Use the new ray.rllib.utils.metrics.metrics_logger::MetricsLogger" - " API in your custom Learner methods for logging your custom values " - "and time-reducing (or parallel-reducing) them.", + old="Learner.update_from_episodes(num_iters=...)", + new="Learner.update_from_episodes(num_epochs=...)", error=True, ) return self._update_from_batch_or_episodes( batch=batch, timesteps=timesteps, + num_epochs=num_epochs, minibatch_size=minibatch_size, - num_iters=num_iters, ) def update_from_episodes( @@ -977,18 +974,16 @@ def update_from_episodes( *, # TODO (sven): Make this a more formal structure with its own type. timesteps: Optional[Dict[str, Any]] = None, - # TODO (sven): Deprecate these in favor of config attributes for only those - # algos that actually need (and know how) to do minibatching. + num_epochs: int = 1, minibatch_size: Optional[int] = None, - num_iters: int = 1, - num_total_mini_batches: int = 0, + num_total_minibatches: int = 0, # Deprecated args. - reduce_fn=DEPRECATED_VALUE, + num_iters=DEPRECATED_VALUE, ) -> ResultDict: - """Do `num_iters` minibatch updates given a list of episodes. + """Run `num_epochs` epochs over the train batch generated from `episodes`. You can use this method to take more than one backward pass on the batch. - The same `minibatch_size` and `num_iters` will be used for all module ids in + The same `minibatch_size` and `num_epochs` will be used for all module ids in MultiRLModule. Args: @@ -996,17 +991,20 @@ def update_from_episodes( timesteps: Timesteps dict, which must have the key `NUM_ENV_STEPS_SAMPLED_LIFETIME`. # TODO (sven): Make this a more formal structure with its own type. - minibatch_size: The size of the minibatch to use for each update. - num_iters: The number of complete passes over all the sub-batches - in the input multi-agent batch. - num_total_mini_batches: The total number of mini-batches to loop through - (across all `num_sgd_iter` SGD iterations). It's required to set this - for multi-agent + multi-GPU situations in which the MultiAgentEpisodes + num_epochs: The number of complete passes over the entire train batch. Each + pass might be further split into n minibatches (if `minibatch_size` + provided). The train batch is generated from the given `episodes` + through the Learner connector pipeline. + minibatch_size: The size of minibatches to use to further split the train + batch into. The train batch is generated from the given `episodes` + through the Learner connector pipeline. 
+ num_total_minibatches: The total number of minibatches to loop through + (over all `num_epochs` epochs). It's only required to set this to != 0 + in multi-agent + multi-GPU situations, in which the MultiAgentEpisodes themselves are roughly sharded equally, however, they might contain SingleAgentEpisodes with very lopsided length distributions. Thus, - without this fixed, pre-computed value it can happen that one Learner - goes through a different number of mini-batches than other Learners, - causing a deadlock. + without this fixed, pre-computed value, one Learner might go through a + different number of minibatche passes than others causing a deadlock. Returns: A `ResultDict` object produced by a call to `self.metrics.reduce()`. The @@ -1015,22 +1013,18 @@ def update_from_episodes( Learner) to further reduce these results (for example over n parallel Learners). """ - if reduce_fn != DEPRECATED_VALUE: + if num_iters != DEPRECATED_VALUE: deprecation_warning( - old="Learner.update_from_episodes(reduce_fn=..)", - new="Learner.metrics.[log_value|log_dict|log_time](key=..., value=..., " - "reduce=[mean|min|max|sum], window=..., ema_coeff=...)", - help="Use the new ray.rllib.utils.metrics.metrics_logger::MetricsLogger" - " API in your custom Learner methods for logging your custom values " - "and time-reducing (or parallel-reducing) them.", + old="Learner.update_from_episodes(num_iters=...)", + new="Learner.update_from_episodes(num_epochs=...)", error=True, ) return self._update_from_batch_or_episodes( episodes=episodes, timesteps=timesteps, minibatch_size=minibatch_size, - num_iters=num_iters, - num_total_mini_batches=num_total_mini_batches, + num_epochs=num_epochs, + num_total_minibatches=num_total_minibatches, ) def update_from_iterator( @@ -1043,7 +1037,7 @@ def update_from_iterator( **kwargs, ): self._check_is_built() - minibatch_size = minibatch_size or 32 + #minibatch_size = minibatch_size or 32 # Call `before_gradient_based_update` to allow for non-gradient based # preparations-, logging-, and update logic to happen. @@ -1228,8 +1222,9 @@ def _update_from_batch_or_episodes( # TODO (sven): Deprecate these in favor of config attributes for only those # algos that actually need (and know how) to do minibatching. minibatch_size: Optional[int] = None, - num_iters: int = 1, - num_total_mini_batches: int = 0, + num_epochs: int = 1, + shuffle_batch_per_epoch: bool = True, + num_total_minibatches: int = 0, ) -> Union[Dict[str, Any], List[Dict[str, Any]]]: self._check_is_built() @@ -1296,17 +1291,12 @@ def _update_from_batch_or_episodes( if minibatch_size: if self._learner_connector is not None: - batch_iter = partial( - MiniBatchCyclicIterator, - uses_new_env_runners=True, - num_total_mini_batches=num_total_mini_batches, - shuffle=self.config.shuffle_single_agent_batch, - ) + batch_iter = partial(MiniBatchCyclicIterator, _uses_new_env_runners=True) else: batch_iter = MiniBatchCyclicIterator - elif num_iters > 1: - # `minibatch_size` was not set but `num_iters` > 1. - # Under the old training stack, users could do multiple sgd passes + elif num_epochs > 1: + # `minibatch_size` was not set but `num_epochs` > 1. + # Under the old training stack, users could do multiple epochs # over a batch without specifying a minibatch size. We enable # this behavior here by setting the minibatch size to be the size # of the batch (e.g. 1 minibatch of size batch.count) @@ -1314,7 +1304,7 @@ def _update_from_batch_or_episodes( # Note that there is no need to shuffle here, b/c we don't have minibatches. 
batch_iter = MiniBatchCyclicIterator else: - # `minibatch_size` and `num_iters` are not set by the user. + # `minibatch_size` and `num_epochs` are not set by the user. batch_iter = MiniBatchDummyIterator # Convert input batch into a tensor batch (MultiAgentBatch) on the correct @@ -1324,7 +1314,13 @@ def _update_from_batch_or_episodes( batch = self._convert_batch_type(batch) batch = self._set_slicing_by_batch_id(batch, value=True) - for tensor_minibatch in batch_iter(batch, minibatch_size, num_iters): + for tensor_minibatch in batch_iter( + batch, + minibatch_size=minibatch_size, + num_epochs=num_epochs, + shuffle_batch_per_epoch=shuffle_batch_per_epoch and (num_epochs > 1), + num_total_minibatches=num_total_minibatches, + ): # Make the actual in-graph/traced `_update` call. This should return # all tensor values (no numpy). fwd_out, loss_per_module, tensor_metrics = self._update( diff --git a/rllib/core/learner/learner_group.py b/rllib/core/learner/learner_group.py index 273af2352031..d746265c9b23 100644 --- a/rllib/core/learner/learner_group.py +++ b/rllib/core/learner/learner_group.py @@ -221,13 +221,9 @@ def update_from_batch( timesteps: Optional[Dict[str, Any]] = None, async_update: bool = False, return_state: bool = False, - # TODO (sven): Deprecate the following args. They should be extracted from the - # self.config of those specific algorithms that actually require these - # settings. + num_epochs: int = 1, + shuffle_batch_per_epoch: bool = False, minibatch_size: Optional[int] = None, - num_iters: int = 1, - # Already deprecated args. - reduce_fn=DEPRECATED_VALUE, # User kwargs. **kwargs, ) -> Union[Dict[str, Any], List[Dict[str, Any]], List[List[Dict[str, Any]]]]: @@ -261,24 +257,13 @@ def update_from_batch( results are reduced, a list of dictionaries of the reduced results from each call to async_update that is ready. """ - if reduce_fn != DEPRECATED_VALUE: - deprecation_warning( - old="LearnerGroup.update_from_batch(reduce_fn=..)", - new="Learner.metrics.[log_value|log_dict|log_time](key=..., value=..., " - "reduce=[mean|min|max|sum], window=..., ema_coeff=...)", - help="Use the new ray.rllib.utils.metrics.metrics_logger::MetricsLogger" - " API in your custom Learner methods for logging and time-reducing any " - "custom metrics. The central `MetricsLogger` instance is available " - "under `self.metrics` within your custom Learner.", - error=True, - ) return self._update( batch=batch, timesteps=timesteps, async_update=async_update, return_state=return_state, minibatch_size=minibatch_size, - num_iters=num_iters, + num_epochs=num_epochs, **kwargs, ) @@ -289,13 +274,9 @@ def update_from_episodes( timesteps: Optional[Dict[str, Any]] = None, async_update: bool = False, return_state: bool = False, - # TODO (sven): Deprecate the following args. They should be extracted from the - # self.config of those specific algorithms that actually require these - # settings. + num_epochs: int = 1, + shuffle_batch_per_epoch: bool = False, minibatch_size: Optional[int] = None, - num_iters: int = 1, - # Already deprecated args. - reduce_fn=DEPRECATED_VALUE, # User kwargs. **kwargs, ) -> Union[Dict[str, Any], List[Dict[str, Any]], List[List[Dict[str, Any]]]]: @@ -329,25 +310,13 @@ def update_from_episodes( results are reduced, a list of dictionaries of the reduced results from each call to async_update that is ready. 
""" - if reduce_fn != DEPRECATED_VALUE: - deprecation_warning( - old="LearnerGroup.update_from_episodes(reduce_fn=..)", - new="Learner.metrics.[log_value|log_dict|log_time](key=..., value=..., " - "reduce=[mean|min|max|sum], window=..., ema_coeff=...)", - help="Use the new ray.rllib.utils.metrics.metrics_logger::MetricsLogger" - " API in your custom Learner methods for logging and time-reducing any " - "custom metrics. The central `MetricsLogger` instance is available " - "under `self.metrics` within your custom Learner.", - error=True, - ) - return self._update( episodes=episodes, timesteps=timesteps, async_update=async_update, return_state=return_state, minibatch_size=minibatch_size, - num_iters=num_iters, + num_epochs=num_epochs, **kwargs, ) @@ -359,11 +328,17 @@ def _update( timesteps: Optional[Dict[str, Any]] = None, async_update: bool = False, return_state: bool = False, + num_epochs: int = 1, minibatch_size: Optional[int] = None, - num_iters: int = 1, + shuffle_batch_per_epoch: bool = False, + # Deprecated args. + num_iters=DEPRECATED_VALUE, **kwargs, ) -> Union[Dict[str, Any], List[Dict[str, Any]], List[List[Dict[str, Any]]]]: + if num_iters != DEPRECATED_VALUE: + deprecation_warning(old="num_iters", new="num_epochs", error=True) + # Define function to be called on all Learner actors (or the local learner). def _learner_update( _learner: Learner, @@ -372,7 +347,7 @@ def _learner_update( _episodes_shard=None, _timesteps=None, _return_state=False, - _num_total_mini_batches=0, + _num_total_minibatches=0, **_kwargs, ): # If the batch shard is an `DataIterator` we have an offline @@ -383,7 +358,7 @@ def _learner_update( iterator=_batch_shard, timesteps=_timesteps, minibatch_size=minibatch_size, - num_iters=num_iters, + num_epochs=num_epochs, **_kwargs, ) elif _batch_shard is not None: @@ -391,7 +366,7 @@ def _learner_update( batch=_batch_shard, timesteps=_timesteps, minibatch_size=minibatch_size, - num_iters=num_iters, + num_epochs=num_epochs, **_kwargs, ) else: @@ -399,8 +374,8 @@ def _learner_update( episodes=_episodes_shard, timesteps=_timesteps, minibatch_size=minibatch_size, - num_iters=num_iters, - num_total_mini_batches=_num_total_mini_batches, + num_epochs=num_epochs, + num_total_minibatches=_num_total_minibatches, **_kwargs, ) if _return_state: @@ -485,13 +460,13 @@ def _learner_update( from ray.data.iterator import DataIterator if isinstance(episodes[0], DataIterator): - num_total_mini_batches = 0 + num_total_minibatches = 0 partials = [ partial( _learner_update, _episodes_shard=episodes_shard, _timesteps=timesteps, - _num_total_mini_batches=num_total_mini_batches, + _num_total_minibatches=num_total_minibatches, ) for episodes_shard in episodes ] @@ -506,20 +481,20 @@ def _learner_update( # In the multi-agent case AND `minibatch_size` AND num_workers # > 1, we compute a max iteration counter such that the different # Learners will not go through a different number of iterations. 
- num_total_mini_batches = 0 + num_total_minibatches = 0 if minibatch_size and len(self._workers) > 1: - num_total_mini_batches = self._compute_num_total_mini_batches( + num_total_minibatches = self._compute_num_total_minibatches( episodes, len(self._workers), minibatch_size, - num_iters, + num_epochs, ) partials = [ partial( _learner_update, _episodes_shard=eps_shard, _timesteps=timesteps, - _num_total_mini_batches=num_total_mini_batches, + _num_total_minibatches=num_total_minibatches, ) for eps_shard in eps_shards ] @@ -934,11 +909,11 @@ def __del__(self): self.shutdown() @staticmethod - def _compute_num_total_mini_batches( + def _compute_num_total_minibatches( episodes, num_shards, - mini_batch_size, - num_iters, + minibatch_size, + num_epochs, ): # Count total number of timesteps per module ID. if isinstance(episodes[0], MultiAgentEpisode): @@ -950,7 +925,7 @@ def _compute_num_total_mini_batches( else: max_ts = sum(map(len, episodes)) - return int((num_iters * max_ts) / (num_shards * mini_batch_size)) + return int((num_epochs * max_ts) / (num_shards * minibatch_size)) @Deprecated(new="LearnerGroup.update_from_batch(async=False)", error=False) def update(self, *args, **kwargs): diff --git a/rllib/env/tests/test_multi_agent_env.py b/rllib/env/tests/test_multi_agent_env.py index 2646c24c41ac..707effbaa4ce 100644 --- a/rllib/env/tests/test_multi_agent_env.py +++ b/rllib/env/tests/test_multi_agent_env.py @@ -598,7 +598,7 @@ def test_multi_agent_with_flex_agents(self): .environment("flex_agents_multi_agent") .env_runners(num_env_runners=0) .framework("tf") - .training(train_batch_size=50, sgd_minibatch_size=50, num_sgd_iter=1) + .training(train_batch_size=50, minibatch_size=50, num_epochs=1) ) algo = config.build() for i in range(10): @@ -863,7 +863,7 @@ def gen_policy(): ), ) .framework("tf") - .training(train_batch_size=50, sgd_minibatch_size=50, num_sgd_iter=1) + .training(train_batch_size=50, minibatch_size=50, num_epochs=1) ) algo = config.build() diff --git a/rllib/evaluation/tests/test_envs_that_crash.py b/rllib/evaluation/tests/test_envs_that_crash.py index 573925b35d6b..cef94ecbd7dd 100644 --- a/rllib/evaluation/tests/test_envs_that_crash.py +++ b/rllib/evaluation/tests/test_envs_that_crash.py @@ -109,7 +109,7 @@ def test_env_crash_on_one_worker_during_sampling_but_recreate_worker(self): recreate_failed_env_runners=True, delay_between_env_runner_restarts_s=0, ) - .training(train_batch_size=60, sgd_minibatch_size=60) + .training(train_batch_size=60, minibatch_size=60) .environment( env=CartPoleCrashing, env_config={ diff --git a/rllib/evaluation/tests/test_rollout_worker.py b/rllib/evaluation/tests/test_rollout_worker.py index 4f8ed097170c..145f4695f849 100644 --- a/rllib/evaluation/tests/test_rollout_worker.py +++ b/rllib/evaluation/tests/test_rollout_worker.py @@ -208,7 +208,7 @@ def test_query_evaluators(self): num_envs_per_env_runner=2, create_env_on_local_worker=True, ) - .training(train_batch_size=20, sgd_minibatch_size=5, num_sgd_iter=1) + .training(train_batch_size=20, minibatch_size=5, num_epochs=1) ) algo = config.build() results = algo.env_runner_group.foreach_worker( diff --git a/rllib/evaluation/tests/test_trajectory_view_api.py b/rllib/evaluation/tests/test_trajectory_view_api.py index 457abba37f63..dab76f73cf56 100644 --- a/rllib/evaluation/tests/test_trajectory_view_api.py +++ b/rllib/evaluation/tests/test_trajectory_view_api.py @@ -290,7 +290,7 @@ def test_counting_by_agent_steps(self): # Env setup. 
.environment(MultiAgentPendulum, env_config={"num_agents": num_agents}) .env_runners(num_env_runners=2, rollout_fragment_length=21) - .training(num_sgd_iter=2, train_batch_size=168) + .training(num_epochs=2, train_batch_size=168) .framework("torch") .multi_agent( policies={f"p{i}" for i in range(num_agents)}, diff --git a/rllib/examples/actions/nested_action_spaces.py b/rllib/examples/actions/nested_action_spaces.py index db7ad434c674..bb8c3dbf4e71 100644 --- a/rllib/examples/actions/nested_action_spaces.py +++ b/rllib/examples/actions/nested_action_spaces.py @@ -84,7 +84,7 @@ def _env_to_module_pipeline(env): base_config.training( # We don't want high entropy in this Env. entropy_coeff=0.00005, - num_sgd_iter=4, + num_epochs=4, vf_loss_coeff=0.01, ) diff --git a/rllib/examples/algorithms/custom_training_step_on_and_off_policy_combined.py b/rllib/examples/algorithms/custom_training_step_on_and_off_policy_combined.py index 95dc0ae26c24..2f31c3e95297 100644 --- a/rllib/examples/algorithms/custom_training_step_on_and_off_policy_combined.py +++ b/rllib/examples/algorithms/custom_training_step_on_and_off_policy_combined.py @@ -178,7 +178,7 @@ def training_step(self) -> ResultDict: None, # Provide entire AlgorithmConfig object, not just an override. PPOConfig() - .training(num_sgd_iter=10, sgd_minibatch_size=128) + .training(num_epochs=10, minibatch_size=128) .framework("torch" if args.torch or args.mixed_torch_tf else "tf"), ), "dqn_policy": ( diff --git a/rllib/examples/cartpole_lstm.py b/rllib/examples/cartpole_lstm.py index a154a73f088a..c7454161ab06 100644 --- a/rllib/examples/cartpole_lstm.py +++ b/rllib/examples/cartpole_lstm.py @@ -67,7 +67,7 @@ ) if args.run == "PPO": - config.training(num_sgd_iter=5, vf_loss_coeff=0.0001, train_batch_size=512) + config.training(num_epochs=5, vf_loss_coeff=0.0001, train_batch_size=512) config.model["vf_share_layers"] = True elif args.run == "IMPALA": config.env_runners(num_env_runners=2) diff --git a/rllib/examples/catalogs/mobilenet_v2_encoder.py b/rllib/examples/catalogs/mobilenet_v2_encoder.py index 119d9f6442ef..93d85bcd7633 100644 --- a/rllib/examples/catalogs/mobilenet_v2_encoder.py +++ b/rllib/examples/catalogs/mobilenet_v2_encoder.py @@ -53,7 +53,7 @@ def _get_encoder_config( # The following training settings make it so that a training iteration is very # quick. This is just for the sake of this example. PPO will not learn properly # with these settings! 
- .training(train_batch_size=32, sgd_minibatch_size=16, num_sgd_iter=1) + .training(train_batch_size=32, minibatch_size=16, num_epochs=1) ) # CartPole's observation space is not compatible with our MobileNetV2 Encoder, so diff --git a/rllib/examples/checkpoints/restore_1_of_n_agents_from_checkpoint.py b/rllib/examples/checkpoints/restore_1_of_n_agents_from_checkpoint.py index e38b46309dc1..9da050fe0b62 100644 --- a/rllib/examples/checkpoints/restore_1_of_n_agents_from_checkpoint.py +++ b/rllib/examples/checkpoints/restore_1_of_n_agents_from_checkpoint.py @@ -96,7 +96,7 @@ .environment("env") .training( train_batch_size_per_learner=512, - mini_batch_size_per_learner=64, + minibatch_size=64, lambda_=0.1, gamma=0.95, lr=0.0003, diff --git a/rllib/examples/connectors/flatten_observations_dict_space.py b/rllib/examples/connectors/flatten_observations_dict_space.py index 6958c9c27cd2..a6af2f16c19e 100644 --- a/rllib/examples/connectors/flatten_observations_dict_space.py +++ b/rllib/examples/connectors/flatten_observations_dict_space.py @@ -142,7 +142,7 @@ def _env_to_module_pipeline(env): # PPO-specific settings (for better learning behavior only). if args.algo == "PPO": base_config.training( - num_sgd_iter=6, + num_epochs=6, vf_loss_coeff=0.01, ) # IMPALA-specific settings (for better learning behavior only). diff --git a/rllib/examples/connectors/frame_stacking.py b/rllib/examples/connectors/frame_stacking.py index 52a4f4c352b1..f0dcea0b9c43 100644 --- a/rllib/examples/connectors/frame_stacking.py +++ b/rllib/examples/connectors/frame_stacking.py @@ -212,8 +212,8 @@ def _env_creator(cfg): # PPO specific settings. if args.algo == "PPO": base_config.training( - num_sgd_iter=10, - mini_batch_size_per_learner=64, + num_epochs=10, + minibatch_size=64, lambda_=0.95, kl_coeff=0.5, clip_param=0.1, diff --git a/rllib/examples/connectors/mean_std_filtering.py b/rllib/examples/connectors/mean_std_filtering.py index 2fec8f3c63d0..75c373333677 100644 --- a/rllib/examples/connectors/mean_std_filtering.py +++ b/rllib/examples/connectors/mean_std_filtering.py @@ -183,7 +183,7 @@ def observation(self, observation): # PPO specific settings. 
if args.algo == "PPO": base_config.training( - mini_batch_size_per_learner=64, + minibatch_size=64, lambda_=0.1, vf_clip_param=10.0, ) diff --git a/rllib/examples/connectors/prev_actions_prev_rewards.py b/rllib/examples/connectors/prev_actions_prev_rewards.py index dcee6ac5689e..a7b0bc056218 100644 --- a/rllib/examples/connectors/prev_actions_prev_rewards.py +++ b/rllib/examples/connectors/prev_actions_prev_rewards.py @@ -141,7 +141,7 @@ def _env_to_module(env): .environment("env") .env_runners(env_to_module_connector=_env_to_module) .training( - num_sgd_iter=6, + num_epochs=6, lr=0.0003, train_batch_size=4000, vf_loss_coeff=0.01, diff --git a/rllib/examples/curiosity/count_based_curiosity.py b/rllib/examples/curiosity/count_based_curiosity.py index 90f69a513ac9..7b9b4b83d500 100644 --- a/rllib/examples/curiosity/count_based_curiosity.py +++ b/rllib/examples/curiosity/count_based_curiosity.py @@ -127,7 +127,7 @@ learner_connector=( None if args.no_curiosity else lambda *ags, **kw: CountBasedCuriosity() ), - num_sgd_iter=10, + num_epochs=10, vf_loss_coeff=0.01, ) .rl_module(model_config_dict={"vf_share_layers": True}) diff --git a/rllib/examples/curiosity/intrinsic_curiosity_model_based_curiosity.py b/rllib/examples/curiosity/intrinsic_curiosity_model_based_curiosity.py index 9aab5a31a4ad..5809c3c9a420 100644 --- a/rllib/examples/curiosity/intrinsic_curiosity_model_based_curiosity.py +++ b/rllib/examples/curiosity/intrinsic_curiosity_model_based_curiosity.py @@ -270,7 +270,7 @@ def on_sample_end( # Set PPO-specific hyper-parameters. if args.algo == "PPO": base_config.training( - num_sgd_iter=6, + num_epochs=6, # Plug in the correct Learner class. learner_class=PPOTorchLearnerWithCuriosity, train_batch_size_per_learner=2000, diff --git a/rllib/examples/curriculum/curriculum_learning.py b/rllib/examples/curriculum/curriculum_learning.py index a6f0e9fb2d26..7a7cd6cc1d41 100644 --- a/rllib/examples/curriculum/curriculum_learning.py +++ b/rllib/examples/curriculum/curriculum_learning.py @@ -218,7 +218,7 @@ def on_train_result( env_to_module_connector=lambda env: FlattenObservations(), ) .training( - num_sgd_iter=6, + num_epochs=6, vf_loss_coeff=0.01, lr=0.0002, ) diff --git a/rllib/examples/debugging/deterministic_training.py b/rllib/examples/debugging/deterministic_training.py index 6bbf538e025c..5ef6ee1a0167 100644 --- a/rllib/examples/debugging/deterministic_training.py +++ b/rllib/examples/debugging/deterministic_training.py @@ -65,7 +65,7 @@ if args.run == "PPO": # Simplify to run this example script faster. - config.training(sgd_minibatch_size=10, num_sgd_iter=5) + config.training(minibatch_size=10, num_epochs=5) stop = {TRAINING_ITERATION: args.stop_iters} diff --git a/rllib/examples/envs/env_rendering_and_recording.py b/rllib/examples/envs/env_rendering_and_recording.py index d910ac92fc57..903056288f4e 100644 --- a/rllib/examples/envs/env_rendering_and_recording.py +++ b/rllib/examples/envs/env_rendering_and_recording.py @@ -264,7 +264,7 @@ def _env_creator(cfg): clip_param=0.1, vf_clip_param=10.0, entropy_coeff=0.01, - num_sgd_iter=10, + num_epochs=10, # Linearly adjust learning rate based on number of GPUs. 
lr=0.00015 * (args.num_gpus or 1), grad_clip=100.0, diff --git a/rllib/examples/envs/greyscale_env.py b/rllib/examples/envs/greyscale_env.py index 5af971ad23fb..2f0e5ffc9560 100644 --- a/rllib/examples/envs/greyscale_env.py +++ b/rllib/examples/envs/greyscale_env.py @@ -101,11 +101,11 @@ def env_creator(config): vf_loss_coeff=0.1, clip_param=0.1, vf_clip_param=10.0, - num_sgd_iter=10, + num_epochs=10, kl_coeff=0.5, lr=0.0001, grad_clip=100, - sgd_minibatch_size=500, + minibatch_size=500, train_batch_size=5000 if not args.as_test else 1000, model={"vf_share_layers": True}, ) diff --git a/rllib/examples/envs/unity3d_env_local.py b/rllib/examples/envs/unity3d_env_local.py index 40350a8c5853..d334125ee4e8 100644 --- a/rllib/examples/envs/unity3d_env_local.py +++ b/rllib/examples/envs/unity3d_env_local.py @@ -145,9 +145,9 @@ lr=0.0003, lambda_=0.95, gamma=0.99, - sgd_minibatch_size=256, + minibatch_size=256, train_batch_size=4000, - num_sgd_iter=20, + num_epochs=20, clip_param=0.2, model={"fcnet_hiddens": [512, 512]}, ) diff --git a/rllib/examples/gpus/float16_training_and_inference.py b/rllib/examples/gpus/float16_training_and_inference.py index 169481b849bb..aa498663b2d6 100644 --- a/rllib/examples/gpus/float16_training_and_inference.py +++ b/rllib/examples/gpus/float16_training_and_inference.py @@ -249,7 +249,7 @@ def configure_optimizers_for_module(self, module_id, config): # Typical CartPole-v1 hyperparams known to work well: gamma=0.99, lr=0.0003, - num_sgd_iter=6, + num_epochs=6, vf_loss_coeff=0.01, use_kl_loss=True, ) diff --git a/rllib/examples/gpus/mixed_precision_training_float16_inference.py b/rllib/examples/gpus/mixed_precision_training_float16_inference.py index 56d4fb171208..e27dd8b7b579 100644 --- a/rllib/examples/gpus/mixed_precision_training_float16_inference.py +++ b/rllib/examples/gpus/mixed_precision_training_float16_inference.py @@ -169,7 +169,7 @@ def _update(self, *args, **kwargs): # Typical CartPole-v1 hyperparams known to work well: gamma=0.99, lr=0.0003, - num_sgd_iter=6, + num_epochs=6, vf_loss_coeff=0.01, use_kl_loss=True, ) diff --git a/rllib/examples/inference/policy_inference_after_training_w_connector.py b/rllib/examples/inference/policy_inference_after_training_w_connector.py index 0e092680b390..8391ca272704 100644 --- a/rllib/examples/inference/policy_inference_after_training_w_connector.py +++ b/rllib/examples/inference/policy_inference_after_training_w_connector.py @@ -151,7 +151,7 @@ def _env_creator(cfg): get_trainable_cls(args.algo) .get_default_config() .training( - num_sgd_iter=6, + num_epochs=6, lr=0.0003, vf_loss_coeff=0.01, ) diff --git a/rllib/examples/learners/custom_loss_fn_simple.py b/rllib/examples/learners/custom_loss_fn_simple.py index 2cf94790c94a..151406330502 100644 --- a/rllib/examples/learners/custom_loss_fn_simple.py +++ b/rllib/examples/learners/custom_loss_fn_simple.py @@ -128,7 +128,7 @@ class for details on how to override the main (PPO) loss function. # `self.config.learner_config_dict['regularizer_coeff']` learner_config_dict={"regularizer_coeff": args.regularizer_coeff}, # Some settings to make this example learn better. - num_sgd_iter=6, + num_epochs=6, vf_loss_coeff=0.01, # The learning rate, settable through the command line `--lr` arg. 
lr=args.lr, diff --git a/rllib/examples/learners/separate_vf_lr_and_optimizer.py b/rllib/examples/learners/separate_vf_lr_and_optimizer.py index b8d21db87f13..93f03e6101c5 100644 --- a/rllib/examples/learners/separate_vf_lr_and_optimizer.py +++ b/rllib/examples/learners/separate_vf_lr_and_optimizer.py @@ -117,7 +117,7 @@ class for details on how to override the main (torch) `configure_optimizers_for_ # `self.config.learner_config_dict['lr_vf']` learner_config_dict={"lr_vf": args.lr_vf}, # Some settings to make this example learn better. - num_sgd_iter=6, + num_epochs=6, # Since we are using separate optimizers for the two NN components, the # value of `vf_loss_coeff` does not matter anymore. We set this to 1.0 here. vf_loss_coeff=1.0, diff --git a/rllib/examples/learners/train_w_bc_finetune_w_ppo.py b/rllib/examples/learners/train_w_bc_finetune_w_ppo.py index 2a5a2baae730..d12ccd3eedbf 100644 --- a/rllib/examples/learners/train_w_bc_finetune_w_ppo.py +++ b/rllib/examples/learners/train_w_bc_finetune_w_ppo.py @@ -125,7 +125,7 @@ def train_ppo_agent_from_checkpointed_module( .training( lr=0.0001, gamma=0.99, - num_sgd_iter=6, + num_epochs=6, vf_loss_coeff=0.01, ) ) diff --git a/rllib/examples/multi_agent/multi_agent_pendulum.py b/rllib/examples/multi_agent/multi_agent_pendulum.py index 80aa2441692e..74ed6045673e 100644 --- a/rllib/examples/multi_agent/multi_agent_pendulum.py +++ b/rllib/examples/multi_agent/multi_agent_pendulum.py @@ -49,7 +49,7 @@ .environment("env" if args.num_agents > 0 else "Pendulum-v1") .training( train_batch_size_per_learner=512, - mini_batch_size_per_learner=64, + minibatch_size=64, lambda_=0.1, gamma=0.95, lr=0.0003, diff --git a/rllib/examples/multi_agent/self_play_league_based_with_open_spiel.py b/rllib/examples/multi_agent/self_play_league_based_with_open_spiel.py index c4fe7e30e814..d1670c3be9c9 100644 --- a/rllib/examples/multi_agent/self_play_league_based_with_open_spiel.py +++ b/rllib/examples/multi_agent/self_play_league_based_with_open_spiel.py @@ -185,7 +185,7 @@ def _get_multi_agent(): num_cpus_for_main_process=1, ) .training( - num_sgd_iter=20, + num_epochs=20, model=dict( **({"uses_new_env_runners": True} if args.enable_new_api_stack else {}), ), diff --git a/rllib/examples/multi_agent/self_play_with_open_spiel.py b/rllib/examples/multi_agent/self_play_with_open_spiel.py index 3c01d25a244c..7420e2604790 100644 --- a/rllib/examples/multi_agent/self_play_with_open_spiel.py +++ b/rllib/examples/multi_agent/self_play_with_open_spiel.py @@ -173,9 +173,9 @@ def policy_mapping_fn(agent_id, episode, worker, **kwargs): ) ) - # Only for PPO, change the `num_sgd_iter` setting. + # Only for PPO, change the `num_epochs` setting. if args.algo == "PPO": - config.training(num_sgd_iter=20) + config.training(num_epochs=20) stop = { NUM_ENV_STEPS_SAMPLED_LIFETIME: args.stop_timesteps, diff --git a/rllib/examples/multi_agent/two_algorithms.py b/rllib/examples/multi_agent/two_algorithms.py index f77c6d0d5c3b..21169110cf7d 100644 --- a/rllib/examples/multi_agent/two_algorithms.py +++ b/rllib/examples/multi_agent/two_algorithms.py @@ -95,7 +95,7 @@ def select_policy(algorithm, framework): .training( model={"vf_share_layers": True}, vf_loss_coeff=0.01, - num_sgd_iter=6, + num_epochs=6, ) # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0. 
.resources(num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0"))) diff --git a/rllib/examples/quadx_waypoints.py b/rllib/examples/quadx_waypoints.py index 4a93fcdbef6c..4afeced1e25d 100644 --- a/rllib/examples/quadx_waypoints.py +++ b/rllib/examples/quadx_waypoints.py @@ -104,7 +104,7 @@ def create_quadx_waypoints_env(env_config): } ) config.training( - sgd_minibatch_size=128, + minibatch_size=128, train_batch_size_per_learner=10000, ) # If IMPALA set additional arguments. diff --git a/rllib/examples/rl_modules/classes/lstm_containing_rlm.py b/rllib/examples/rl_modules/classes/lstm_containing_rlm.py index 993df559301b..c2dd1c230d2d 100644 --- a/rllib/examples/rl_modules/classes/lstm_containing_rlm.py +++ b/rllib/examples/rl_modules/classes/lstm_containing_rlm.py @@ -13,7 +13,7 @@ torch, nn = try_import_torch() -class LSTMContainingRLModule(TorchRLModule): +class LSTMContainingRLModule(TorchRLModule, ValueFunctionAPI): """An example TorchRLModule that contains an LSTM layer. .. testcode:: diff --git a/rllib/examples/rl_modules/classes/mobilenet_rlm.py b/rllib/examples/rl_modules/classes/mobilenet_rlm.py index 49878ec555f9..7cd87d5b8922 100644 --- a/rllib/examples/rl_modules/classes/mobilenet_rlm.py +++ b/rllib/examples/rl_modules/classes/mobilenet_rlm.py @@ -75,7 +75,7 @@ def setup(self): # The following training settings make it so that a training iteration is very # quick. This is just for the sake of this example. PPO will not learn properly # with these settings! - .training(train_batch_size=32, sgd_minibatch_size=16, num_sgd_iter=1) + .training(train_batch_size=32, minibatch_size=16, num_epochs=1) ) config.build().train() diff --git a/rllib/examples/rl_modules/classes/tiny_atari_cnn_rlm.py b/rllib/examples/rl_modules/classes/tiny_atari_cnn_rlm.py index 6089583a31b3..88ea754b7217 100644 --- a/rllib/examples/rl_modules/classes/tiny_atari_cnn_rlm.py +++ b/rllib/examples/rl_modules/classes/tiny_atari_cnn_rlm.py @@ -17,7 +17,7 @@ torch, nn = try_import_torch() -class TinyAtariCNN(TorchRLModule): +class TinyAtariCNN(TorchRLModule, ValueFunctionAPI): """A tiny CNN stack for fast-learning of Atari envs. The architecture here is the exact same as the one used by the old API stack as diff --git a/rllib/examples/rl_modules/custom_lstm_rl_module.py b/rllib/examples/rl_modules/custom_lstm_rl_module.py index 3d3cf285eb19..14285d16e5b6 100644 --- a/rllib/examples/rl_modules/custom_lstm_rl_module.py +++ b/rllib/examples/rl_modules/custom_lstm_rl_module.py @@ -80,7 +80,7 @@ ) .training( train_batch_size_per_learner=1024, - num_sgd_iter=6, + num_epochs=6, lr=0.0009, vf_loss_coeff=0.001, entropy_coeff=0.0, diff --git a/rllib/execution/train_ops.py b/rllib/execution/train_ops.py index b78453d3e9d0..bf930a00f5e2 100644 --- a/rllib/execution/train_ops.py +++ b/rllib/execution/train_ops.py @@ -45,14 +45,14 @@ def train_one_step(algorithm, train_batch, policies_to_train=None) -> Dict: config = algorithm.config workers = algorithm.env_runner_group local_worker = workers.local_env_runner - num_sgd_iter = config.get("num_sgd_iter", 1) - sgd_minibatch_size = config.get("sgd_minibatch_size", 0) + num_sgd_iter = config.get("num_epochs", config.get("num_sgd_iter", 1)) + minibatch_size = config.get("minibatch_size", config.get("sgd_minibatch_size", 0)) learn_timer = algorithm._timers[LEARN_ON_BATCH_TIMER] with learn_timer: - # Subsample minibatches (size=`sgd_minibatch_size`) from the + # Subsample minibatches (size=`minibatch_size`) from the # train batch and loop through train batch `num_sgd_iter` times. 
- if num_sgd_iter > 1 or sgd_minibatch_size > 0: + if num_sgd_iter > 1 or minibatch_size > 0: info = do_minibatch_sgd( train_batch, { @@ -62,7 +62,7 @@ def train_one_step(algorithm, train_batch, policies_to_train=None) -> Dict: }, local_worker, num_sgd_iter, - sgd_minibatch_size, + minibatch_size, [], ) # Single update step using train batch. @@ -114,15 +114,15 @@ def multi_gpu_train_one_step(algorithm, train_batch) -> Dict: config = algorithm.config workers = algorithm.env_runner_group local_worker = workers.local_env_runner - num_sgd_iter = config.get("num_sgd_iter", 1) - sgd_minibatch_size = config.get("sgd_minibatch_size", config["train_batch_size"]) + num_sgd_iter = config.get("num_epochs", config.get("num_sgd_iter", 1)) + minibatch_size = config.get("minibatch_size", config["train_batch_size"]) # Determine the number of devices (GPUs or 1 CPU) we use. num_devices = int(math.ceil(config["num_gpus"] or 1)) # Make sure total batch size is dividable by the number of devices. # Batch size per tower. - per_device_batch_size = sgd_minibatch_size // num_devices + per_device_batch_size = minibatch_size // num_devices # Total batch size. batch_size = per_device_batch_size * num_devices assert batch_size % num_devices == 0 diff --git a/rllib/models/tests/test_attention_nets.py b/rllib/models/tests/test_attention_nets.py index 1ccc216aec3c..bed5ad726fbc 100644 --- a/rllib/models/tests/test_attention_nets.py +++ b/rllib/models/tests/test_attention_nets.py @@ -68,9 +68,9 @@ def test_attention_nets_w_prev_actions_and_prev_rewards(self): "attention_use_n_prev_actions": 3, "attention_use_n_prev_rewards": 2, }, - "num_sgd_iter": 1, + "num_epochs": 1, "train_batch_size": 200, - "sgd_minibatch_size": 50, + "minibatch_size": 50, "rollout_fragment_length": 100, "num_env_runners": 1, } @@ -88,7 +88,7 @@ def test_ppo_attention_net_learning(self): "num_env_runners": 0, "entropy_coeff": 0.001, "vf_loss_coeff": 1e-5, - "num_sgd_iter": 5, + "num_epochs": 5, "model": { "custom_model": "attention_net", "max_seq_len": 10, diff --git a/rllib/models/tests/test_lstms.py b/rllib/models/tests/test_lstms.py index c8d204b395e5..b49e0db2628f 100644 --- a/rllib/models/tests/test_lstms.py +++ b/rllib/models/tests/test_lstms.py @@ -49,9 +49,9 @@ def test_lstm_w_prev_action_and_prev_reward(self): "lstm_use_prev_action": True, "lstm_use_prev_reward": True, }, - num_sgd_iter=1, + num_epochs=1, train_batch_size=200, - sgd_minibatch_size=50, + minibatch_size=50, ) .env_runners( rollout_fragment_length=100, diff --git a/rllib/models/tests/test_preprocessors.py b/rllib/models/tests/test_preprocessors.py index 64b0836caec6..aa5e5f3758d2 100644 --- a/rllib/models/tests/test_preprocessors.py +++ b/rllib/models/tests/test_preprocessors.py @@ -52,8 +52,8 @@ def test_rlms_and_preprocessing(self): .env_runners(num_env_runners=0) .training( train_batch_size=10, - sgd_minibatch_size=1, - num_sgd_iter=1, + minibatch_size=1, + num_epochs=1, ) # Set this to True to enforce no preprocessors being used. .experimental(_disable_preprocessor_api=True) @@ -90,7 +90,7 @@ def test_preprocessing_disabled_modelv2(self): ) # Speed things up a little. .env_runners(rollout_fragment_length=5) - .training(train_batch_size=100, sgd_minibatch_size=10, num_sgd_iter=1) + .training(train_batch_size=100, minibatch_size=10, num_epochs=1) .debugging(seed=42) # Set this to True to enforce no preprocessors being used. 
# Complex observations now arrive directly in the model as diff --git a/rllib/policy/dynamic_tf_policy.py b/rllib/policy/dynamic_tf_policy.py index edda8c818b5f..efd7b4024131 100644 --- a/rllib/policy/dynamic_tf_policy.py +++ b/rllib/policy/dynamic_tf_policy.py @@ -618,7 +618,8 @@ def learn_on_loaded_batch(self, offset: int = 0, buffer_index: int = 0): # Get the correct slice of the already loaded batch to use, # based on offset and batch size. batch_size = self.config.get( - "sgd_minibatch_size", self.config["train_batch_size"] + "minibatch_size", + self.config.get("sgd_minibatch_size", self.config["train_batch_size"]), ) if batch_size >= len(self._loaded_single_cpu_batch): sliced_batch = self._loaded_single_cpu_batch @@ -972,7 +973,7 @@ def __init__( self.max_per_device_batch_size = ( max_per_device_batch_size or policy.config.get( - "sgd_minibatch_size", policy.config.get("train_batch_size", 999999) + "minibatch_size", policy.config.get("train_batch_size", 999999) ) ) // len(self.devices) input_placeholders = tree.flatten(self.policy._loss_input_dict_no_rnn) @@ -1181,7 +1182,7 @@ def load_data(self, sess, inputs, state_inputs, num_grad_updates=None): if sequences_per_minibatch < len(self.devices): raise ValueError( "Must load at least 1 tuple sequence per device. Try " - "increasing `sgd_minibatch_size` or reducing `max_seq_len` " + "increasing `minibatch_size` or reducing `max_seq_len` " "to ensure that at least one sequence fits per device." ) self._loaded_per_device_batch_size = ( diff --git a/rllib/policy/dynamic_tf_policy_v2.py b/rllib/policy/dynamic_tf_policy_v2.py index 1f1d41aa1760..f11cba1ee57d 100644 --- a/rllib/policy/dynamic_tf_policy_v2.py +++ b/rllib/policy/dynamic_tf_policy_v2.py @@ -1004,8 +1004,10 @@ def learn_on_loaded_batch(self, offset: int = 0, buffer_index: int = 0): # Get the correct slice of the already loaded batch to use, # based on offset and batch size. batch_size = self.config.get( - "sgd_minibatch_size", self.config["train_batch_size"] + "minibatch_size", + self.config.get("sgd_minibatch_size", self.config["train_batch_size"]), ) + if batch_size >= len(self._loaded_single_cpu_batch): sliced_batch = self._loaded_single_cpu_batch else: diff --git a/rllib/policy/torch_policy.py b/rllib/policy/torch_policy.py index 6d53b78da360..e4db6d37a5c0 100644 --- a/rllib/policy/torch_policy.py +++ b/rllib/policy/torch_policy.py @@ -549,7 +549,7 @@ def learn_on_loaded_batch(self, offset: int = 0, buffer_index: int = 0): # Get the correct slice of the already loaded batch to use, # based on offset and batch size. device_batch_size = self.config.get( - "sgd_minibatch_size", self.config["train_batch_size"] + "minibatch_size", self.config["train_batch_size"] ) // len(self.devices) # Set Model to train mode. diff --git a/rllib/policy/torch_policy_v2.py b/rllib/policy/torch_policy_v2.py index a86236108ac1..649fc19f88e3 100644 --- a/rllib/policy/torch_policy_v2.py +++ b/rllib/policy/torch_policy_v2.py @@ -839,7 +839,8 @@ def learn_on_loaded_batch(self, offset: int = 0, buffer_index: int = 0): # Get the correct slice of the already loaded batch to use, # based on offset and batch size. device_batch_size = self.config.get( - "sgd_minibatch_size", self.config["train_batch_size"] + "minibatch_size", + self.config.get("sgd_minibatch_size", self.config["train_batch_size"]), ) // len(self.devices) # Set Model to train mode. 
diff --git a/rllib/tests/backward_compat/checkpoints/create_checkpoints.py b/rllib/tests/backward_compat/checkpoints/create_checkpoints.py index 952d299d385f..d66bcd1f87cf 100644 --- a/rllib/tests/backward_compat/checkpoints/create_checkpoints.py +++ b/rllib/tests/backward_compat/checkpoints/create_checkpoints.py @@ -9,7 +9,7 @@ PPOConfig() .environment("FrozenLake-v1") .training( - num_sgd_iter=2, + num_epochs=2, model=dict( fcnet_hiddens=[10], ), diff --git a/rllib/tests/test_io.py b/rllib/tests/test_io.py index 0fe968a2ae61..4207336bc99a 100644 --- a/rllib/tests/test_io.py +++ b/rllib/tests/test_io.py @@ -195,7 +195,7 @@ def test_agent_input_list(self): config = ( PPOConfig() .environment("CartPole-v1") - .training(train_batch_size=98, sgd_minibatch_size=49) + .training(train_batch_size=98, minibatch_size=49) .evaluation(off_policy_estimation_methods={}) ) diff --git a/rllib/tests/test_lstm.py b/rllib/tests/test_lstm.py index d93951be0f67..eda9a0c3e440 100644 --- a/rllib/tests/test_lstm.py +++ b/rllib/tests/test_lstm.py @@ -183,8 +183,8 @@ def test_simple_optimizer_sequencing(self): .env_runners(num_env_runners=0, rollout_fragment_length=10) .training( train_batch_size=10, - sgd_minibatch_size=10, - num_sgd_iter=1, + minibatch_size=10, + num_epochs=1, model={ "custom_model": "rnn", "max_seq_len": 4, @@ -254,8 +254,8 @@ def test_minibatch_sequencing(self): .env_runners(num_env_runners=0, rollout_fragment_length=20) .training( train_batch_size=20, - sgd_minibatch_size=10, - num_sgd_iter=1, + minibatch_size=10, + num_epochs=1, model={ "custom_model": "rnn", "max_seq_len": 4, diff --git a/rllib/tests/test_nested_observation_spaces.py b/rllib/tests/test_nested_observation_spaces.py index b4d236341f71..402cf859b8cf 100644 --- a/rllib/tests/test_nested_observation_spaces.py +++ b/rllib/tests/test_nested_observation_spaces.py @@ -399,8 +399,8 @@ def test_torch_model(self): .env_runners(num_env_runners=0, rollout_fragment_length=5) .training( train_batch_size=5, - sgd_minibatch_size=5, - num_sgd_iter=1, + minibatch_size=5, + num_epochs=1, model={"custom_model": "composite"}, ) ) @@ -441,8 +441,8 @@ def test_torch_repeated(self): .env_runners(num_env_runners=0, rollout_fragment_length=5) .training( train_batch_size=5, - num_sgd_iter=1, - sgd_minibatch_size=5, + num_epochs=1, + minibatch_size=5, model={"custom_model": "r1"}, ) ) diff --git a/rllib/tests/test_supported_multi_agent.py b/rllib/tests/test_supported_multi_agent.py index 469dba2ea790..a6e8c52ae76c 100644 --- a/rllib/tests/test_supported_multi_agent.py +++ b/rllib/tests/test_supported_multi_agent.py @@ -65,7 +65,7 @@ def test_ppo_multiagent(self): ( PPOConfig() .env_runners(num_env_runners=1, rollout_fragment_length=10) - .training(num_sgd_iter=1, train_batch_size=10, sgd_minibatch_size=1) + .training(num_epochs=1, train_batch_size=10, minibatch_size=1) ), ) diff --git a/rllib/tests/test_supported_spaces.py b/rllib/tests/test_supported_spaces.py index 58b8d50a4150..765cea010f3d 100644 --- a/rllib/tests/test_supported_spaces.py +++ b/rllib/tests/test_supported_spaces.py @@ -74,8 +74,8 @@ def test_ppo(self): .env_runners(num_env_runners=2, rollout_fragment_length=50) .training( train_batch_size=100, - num_sgd_iter=1, - sgd_minibatch_size=50, + num_epochs=1, + minibatch_size=50, model={ "fcnet_hiddens": [10], }, @@ -103,8 +103,8 @@ def test_ppo_no_preprocessors_gpu(self): .env_runners(num_env_runners=2, rollout_fragment_length=50) .training( train_batch_size=100, - num_sgd_iter=1, - sgd_minibatch_size=50, + num_epochs=1, + 
minibatch_size=50, model={ "fcnet_hiddens": [10], }, diff --git a/rllib/tuned_examples/appo/cartpole-appo-separate-losses.py b/rllib/tuned_examples/appo/cartpole-appo-separate-losses.py index c0a9d18eed8b..f75c42912134 100644 --- a/rllib/tuned_examples/appo/cartpole-appo-separate-losses.py +++ b/rllib/tuned_examples/appo/cartpole-appo-separate-losses.py @@ -23,7 +23,7 @@ _separate_vf_optimizer=True, # Separate learning rate (and schedule) for the value function branch. _lr_vf=tune.grid_search([0.00075, [[0, 0.00075], [100000, 0.0003]]]), - num_sgd_iter=6, + num_epochs=6, # `vf_loss_coeff` will be ignored anyways as we use separate loss terms. vf_loss_coeff=0.01, vtrace=True, diff --git a/rllib/tuned_examples/appo/cartpole-appo.yaml b/rllib/tuned_examples/appo/cartpole-appo.yaml index 03cd464cf495..bfceaddcf02f 100644 --- a/rllib/tuned_examples/appo/cartpole-appo.yaml +++ b/rllib/tuned_examples/appo/cartpole-appo.yaml @@ -12,7 +12,7 @@ cartpole-appo: num_env_runners: 4 num_gpus: 0 observation_filter: MeanStdFilter - num_sgd_iter: 1 + num_epochs: 1 vf_loss_coeff: 0.01 vtrace: true model: diff --git a/rllib/tuned_examples/appo/cartpole_appo.py b/rllib/tuned_examples/appo/cartpole_appo.py index b84e7d2b6cf9..865c4ce85c31 100644 --- a/rllib/tuned_examples/appo/cartpole_appo.py +++ b/rllib/tuned_examples/appo/cartpole_appo.py @@ -6,7 +6,10 @@ ) from ray.rllib.utils.test_utils import add_rllib_example_script_args -parser = add_rllib_example_script_args() +parser = add_rllib_example_script_args( + default_reward=450.0, + default_timesteps=2000000, +) parser.set_defaults(enable_new_api_stack=True) # Use `parser` to add your own custom command line options to this script # and (if needed) use their values toset up `config` below. @@ -22,24 +25,24 @@ ) .environment("CartPole-v1") .training( + train_batch_size_per_learner=1000, vf_loss_coeff=0.05, - entropy_coeff=0.0, + entropy_coeff=0.01, + num_epochs=2, + lr=0.00075, + minibatch_size=250, ) .rl_module( model_config_dict={ - "vf_share_layers": True, + "fcnet_hiddens": [32], + #"vf_share_layers": True, "uses_new_env_runners": True, }, ) ) -stop = { - f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": 450.0, - NUM_ENV_STEPS_SAMPLED_LIFETIME: 2000000, -} - if __name__ == "__main__": from ray.rllib.utils.test_utils import run_rllib_example_script_experiment - run_rllib_example_script_experiment(config, args, stop=stop) + run_rllib_example_script_experiment(config, args) diff --git a/rllib/tuned_examples/appo/frozenlake-appo-vtrace.yaml b/rllib/tuned_examples/appo/frozenlake-appo-vtrace.yaml index c8f5d37cb971..5af435924178 100644 --- a/rllib/tuned_examples/appo/frozenlake-appo-vtrace.yaml +++ b/rllib/tuned_examples/appo/frozenlake-appo-vtrace.yaml @@ -29,5 +29,5 @@ frozenlake-appo-vtrace: num_envs_per_env_runner: 5 num_env_runners: 4 num_gpus: 0 - num_sgd_iter: 1 + num_epochs: 1 vf_loss_coeff: 0.01 diff --git a/rllib/tuned_examples/appo/halfcheetah-appo.yaml b/rllib/tuned_examples/appo/halfcheetah-appo.yaml index 0102b15d999b..169e4e82b184 100644 --- a/rllib/tuned_examples/appo/halfcheetah-appo.yaml +++ b/rllib/tuned_examples/appo/halfcheetah-appo.yaml @@ -21,7 +21,7 @@ halfcheetah-appo: num_multi_gpu_tower_stacks: 1 num_envs_per_env_runner: 32 minibatch_buffer_size: 16 - num_sgd_iter: 32 + num_epochs: 32 clip_param: 0.2 lr_schedule: [ [0, 0.0005], diff --git a/rllib/tuned_examples/appo/multi-agent-cartpole-w-100-policies-appo.py b/rllib/tuned_examples/appo/multi-agent-cartpole-w-100-policies-appo.py index 071cae713fc3..091be32489c3 100644 --- 
a/rllib/tuned_examples/appo/multi-agent-cartpole-w-100-policies-appo.py +++ b/rllib/tuned_examples/appo/multi-agent-cartpole-w-100-policies-appo.py @@ -35,7 +35,7 @@ "fcnet_activation": "linear", "vf_share_layers": True, }, - num_sgd_iter=1, + num_epochs=1, vf_loss_coeff=0.005, vtrace=True, ) diff --git a/rllib/tuned_examples/appo/multi_agent_cartpole_appo_old_api_stack.py b/rllib/tuned_examples/appo/multi_agent_cartpole_appo_old_api_stack.py index 95277a40920a..a1ed308c55a4 100644 --- a/rllib/tuned_examples/appo/multi_agent_cartpole_appo_old_api_stack.py +++ b/rllib/tuned_examples/appo/multi_agent_cartpole_appo_old_api_stack.py @@ -25,7 +25,7 @@ policy_mapping_fn=(lambda agent_id, episode, worker, **kwargs: f"p{agent_id}"), ) .training( - num_sgd_iter=1, + num_epochs=1, vf_loss_coeff=0.005, vtrace=True, model={ diff --git a/rllib/tuned_examples/appo/multi_agent_stateless_cartpole_appo.py b/rllib/tuned_examples/appo/multi_agent_stateless_cartpole_appo.py index a8713f4350d3..31ae4c95a90d 100644 --- a/rllib/tuned_examples/appo/multi_agent_stateless_cartpole_appo.py +++ b/rllib/tuned_examples/appo/multi_agent_stateless_cartpole_appo.py @@ -9,10 +9,7 @@ from ray.rllib.utils.test_utils import add_rllib_example_script_args from ray.tune.registry import register_env -parser = add_rllib_example_script_args( - default_timesteps=2000000, - default_reward=350.0, -) +parser = add_rllib_example_script_args(default_timesteps=2000000) parser.set_defaults( enable_new_api_stack=True, num_agents=2, @@ -39,7 +36,7 @@ .training( train_batch_size_per_learner=600, lr=0.0005 * ((args.num_gpus or 1) ** 0.5), - num_sgd_iter=6, + num_epochs=6, vf_loss_coeff=0.05, grad_clip=20.0, ) diff --git a/rllib/tuned_examples/appo/pendulum-appo.yaml b/rllib/tuned_examples/appo/pendulum-appo.yaml index dd274338e1f2..6e9f544af4d9 100644 --- a/rllib/tuned_examples/appo/pendulum-appo.yaml +++ b/rllib/tuned_examples/appo/pendulum-appo.yaml @@ -16,7 +16,7 @@ pendulum-appo-vtrace: lr: 0.0003 train_batch_size: 100 minibatch_buffer_size: 16 - num_sgd_iter: 10 + num_epochs: 10 model: fcnet_hiddens: [256, 256] batch_mode: truncate_episodes diff --git a/rllib/tuned_examples/appo/pong-appo-w-rl-modules-and-learner.yaml b/rllib/tuned_examples/appo/pong-appo-w-rl-modules-and-learner.yaml index e3a88164a579..94088ab67c29 100644 --- a/rllib/tuned_examples/appo/pong-appo-w-rl-modules-and-learner.yaml +++ b/rllib/tuned_examples/appo/pong-appo-w-rl-modules-and-learner.yaml @@ -26,7 +26,7 @@ appo-pongnoframeskip-v5: broadcast_interval: 1 max_sample_requests_in_flight_per_worker: 1 num_envs_per_env_runner: 8 - num_sgd_iter: 2 + num_epochs: 2 vf_loss_coeff: 1.0 clip_param: 0.3 diff --git a/rllib/tuned_examples/appo/pong-appo.yaml b/rllib/tuned_examples/appo/pong-appo.yaml index af2e2afe7248..837e0559a8f8 100644 --- a/rllib/tuned_examples/appo/pong-appo.yaml +++ b/rllib/tuned_examples/appo/pong-appo.yaml @@ -28,7 +28,7 @@ pong-appo: num_multi_gpu_tower_stacks: 1 num_envs_per_env_runner: 8 minibatch_buffer_size: 4 - num_sgd_iter: 2 + num_epochs: 2 vf_loss_coeff: 1.0 clip_param: 0.3 num_gpus: 1 diff --git a/rllib/tuned_examples/appo/stateless_cartpole_appo.py b/rllib/tuned_examples/appo/stateless_cartpole_appo.py index 774f3764b738..b6672758dab6 100644 --- a/rllib/tuned_examples/appo/stateless_cartpole_appo.py +++ b/rllib/tuned_examples/appo/stateless_cartpole_appo.py @@ -29,7 +29,7 @@ ) .training( lr=0.0005 * ((args.num_gpus or 1) ** 0.5), - num_sgd_iter=6, + num_epochs=6, vf_loss_coeff=0.05, grad_clip=20.0, ) diff --git 
a/rllib/tuned_examples/bc/cartpole_recording.py b/rllib/tuned_examples/bc/cartpole_recording.py index 673a48c75900..f05cdcf8c6dc 100644 --- a/rllib/tuned_examples/bc/cartpole_recording.py +++ b/rllib/tuned_examples/bc/cartpole_recording.py @@ -33,7 +33,7 @@ .training( gamma=0.99, lr=0.0003, - num_sgd_iter=6, + num_epochs=6, vf_loss_coeff=0.01, use_kl_loss=True, ) diff --git a/rllib/tuned_examples/compact-regression-test.yaml b/rllib/tuned_examples/compact-regression-test.yaml index 74a89ed0a650..21dbdb6d1be4 100644 --- a/rllib/tuned_examples/compact-regression-test.yaml +++ b/rllib/tuned_examples/compact-regression-test.yaml @@ -41,8 +41,8 @@ atari-ppo-tf: entropy_coeff: 0.01 train_batch_size: 5000 rollout_fragment_length: 100 - sgd_minibatch_size: 500 - num_sgd_iter: 10 + minibatch_size: 500 + num_epochs: 10 num_env_runners: 10 num_envs_per_env_runner: 5 batch_mode: truncate_episodes @@ -68,8 +68,8 @@ atari-ppo-torch: entropy_coeff: 0.01 train_batch_size: 5000 rollout_fragment_length: 100 - sgd_minibatch_size: 500 - num_sgd_iter: 10 + minibatch_size: 500 + num_epochs: 10 num_env_runners: 10 num_envs_per_env_runner: 5 batch_mode: truncate_episodes diff --git a/rllib/tuned_examples/impala/cartpole-impala-separate-losses.py b/rllib/tuned_examples/impala/cartpole-impala-separate-losses.py index 2f890e68308f..95b6feb478d5 100644 --- a/rllib/tuned_examples/impala/cartpole-impala-separate-losses.py +++ b/rllib/tuned_examples/impala/cartpole-impala-separate-losses.py @@ -23,7 +23,7 @@ _separate_vf_optimizer=True, # Separate learning rate for the value function branch. _lr_vf=0.00075, - num_sgd_iter=6, + num_epochs=6, # `vf_loss_coeff` will be ignored anyways as we use separate loss terms. vf_loss_coeff=0.01, vtrace=True, diff --git a/rllib/tuned_examples/impala/multi_agent_cartpole_impala_old_api_stack.py b/rllib/tuned_examples/impala/multi_agent_cartpole_impala_old_api_stack.py index d1748fef0911..0fdf075802d6 100644 --- a/rllib/tuned_examples/impala/multi_agent_cartpole_impala_old_api_stack.py +++ b/rllib/tuned_examples/impala/multi_agent_cartpole_impala_old_api_stack.py @@ -25,7 +25,7 @@ policy_mapping_fn=(lambda agent_id, episode, worker, **kwargs: f"p{agent_id}"), ) .training( - num_sgd_iter=1, + num_epochs=1, vf_loss_coeff=0.005, vtrace=True, model={ diff --git a/rllib/tuned_examples/ppo/atari_ppo.py b/rllib/tuned_examples/ppo/atari_ppo.py index 5f06866894a6..8c6629d7ee05 100644 --- a/rllib/tuned_examples/ppo/atari_ppo.py +++ b/rllib/tuned_examples/ppo/atari_ppo.py @@ -59,13 +59,13 @@ def _env_creator(cfg): .training( learner_connector=_make_learner_connector, train_batch_size_per_learner=4000, # 5000 on old yaml example - mini_batch_size_per_learner=128, # 500 on old yaml example + minibatch_size=128, # 500 on old yaml example lambda_=0.95, kl_coeff=0.5, clip_param=0.1, vf_clip_param=10.0, entropy_coeff=0.01, - num_sgd_iter=10, + num_epochs=10, lr=0.00015 * args.num_gpus, grad_clip=100.0, grad_clip_by="global_norm", diff --git a/rllib/tuned_examples/ppo/benchmark_ppo_mujoco.py b/rllib/tuned_examples/ppo/benchmark_ppo_mujoco.py index e266f1b64902..18de125d7f06 100644 --- a/rllib/tuned_examples/ppo/benchmark_ppo_mujoco.py +++ b/rllib/tuned_examples/ppo/benchmark_ppo_mujoco.py @@ -107,9 +107,9 @@ def stop_all(self): gamma=0.99, lambda_=0.95, lr=0.0003, - num_sgd_iter=15, + num_epochs=15, train_batch_size=32 * 512, - sgd_minibatch_size=4096, + minibatch_size=4096, vf_loss_coeff=0.01, model={ "fcnet_hiddens": [64, 64], diff --git a/rllib/tuned_examples/ppo/benchmark_ppo_mujoco_pb2.py 
b/rllib/tuned_examples/ppo/benchmark_ppo_mujoco_pb2.py index 841ee40a52e1..8116a2431cd5 100644 --- a/rllib/tuned_examples/ppo/benchmark_ppo_mujoco_pb2.py +++ b/rllib/tuned_examples/ppo/benchmark_ppo_mujoco_pb2.py @@ -51,8 +51,8 @@ "vf_loss_coeff": [0.01, 1.0], "clip_param": [0.1, 0.3], "kl_target": [0.01, 0.03], - "sgd_minibatch_size": [512, 4096], - "num_sgd_iter": [6, 32], + "minibatch_size": [512, 4096], + "num_epochs": [6, 32], "vf_share_layers": [False, True], "use_kl_loss": [False, True], "kl_coeff": [0.1, 0.4], @@ -96,15 +96,15 @@ vf_loss_coeff=tune.uniform(0.01, 1.0), clip_param=tune.uniform(0.1, 0.3), kl_target=tune.uniform(0.01, 0.03), - sgd_minibatch_size=tune.choice([512, 1024, 2048, 4096]), - num_sgd_iter=tune.randint(6, 32), + minibatch_size=tune.choice([512, 1024, 2048, 4096]), + num_epochs=tune.randint(6, 32), vf_share_layers=tune.choice([True, False]), use_kl_loss=tune.choice([True, False]), kl_coeff=tune.uniform(0.1, 0.4), vf_clip_param=tune.choice([10.0, 40.0, float("inf")]), grad_clip=tune.choice([None, 40, 100, 200]), train_batch_size=tune.sample_from( - lambda spec: spec.config["sgd_minibatch_size"] * num_rollout_workers + lambda spec: spec.config["minibatch_size"] * num_rollout_workers ), model={ "fcnet_hiddens": [64, 64], diff --git a/rllib/tuned_examples/ppo/cartpole-ppo.yaml b/rllib/tuned_examples/ppo/cartpole-ppo.yaml index 2042d496b464..94a093eec3b3 100644 --- a/rllib/tuned_examples/ppo/cartpole-ppo.yaml +++ b/rllib/tuned_examples/ppo/cartpole-ppo.yaml @@ -11,7 +11,7 @@ cartpole-ppo: gamma: 0.99 lr: 0.0003 num_env_runners: 1 - num_sgd_iter: 6 + num_epochs: 6 vf_loss_coeff: 0.01 model: fcnet_hiddens: [32] diff --git a/rllib/tuned_examples/ppo/cartpole_ppo.py b/rllib/tuned_examples/ppo/cartpole_ppo.py index cc9171ee5fc7..612f267f188a 100644 --- a/rllib/tuned_examples/ppo/cartpole_ppo.py +++ b/rllib/tuned_examples/ppo/cartpole_ppo.py @@ -31,7 +31,7 @@ .training( gamma=0.99, lr=0.0003, - num_sgd_iter=6, + num_epochs=6, vf_loss_coeff=0.01, use_kl_loss=True, ) diff --git a/rllib/tuned_examples/ppo/cartpole_truncated_ppo.py b/rllib/tuned_examples/ppo/cartpole_truncated_ppo.py index a51e21a48b5f..57f1ecffda4a 100644 --- a/rllib/tuned_examples/ppo/cartpole_truncated_ppo.py +++ b/rllib/tuned_examples/ppo/cartpole_truncated_ppo.py @@ -35,7 +35,7 @@ .training( gamma=0.99, lr=0.0003, - num_sgd_iter=6, + num_epochs=6, vf_loss_coeff=0.01, use_kl_loss=True, ) diff --git a/rllib/tuned_examples/ppo/halfcheetah-ppo.yaml b/rllib/tuned_examples/ppo/halfcheetah-ppo.yaml index 0e050266a96d..96fded2c6a1c 100644 --- a/rllib/tuned_examples/ppo/halfcheetah-ppo.yaml +++ b/rllib/tuned_examples/ppo/halfcheetah-ppo.yaml @@ -11,11 +11,11 @@ halfcheetah-ppo: gamma: 0.99 lambda: 0.95 kl_coeff: 1.0 - num_sgd_iter: 32 + num_epochs: 32 lr: .0003 vf_loss_coeff: 0.5 clip_param: 0.2 - sgd_minibatch_size: 4096 + minibatch_size: 4096 train_batch_size: 65536 num_env_runners: 16 num_gpus: 1 diff --git a/rllib/tuned_examples/ppo/hopper-ppo.yaml b/rllib/tuned_examples/ppo/hopper-ppo.yaml index a12df2073ee5..3ad4890618f5 100644 --- a/rllib/tuned_examples/ppo/hopper-ppo.yaml +++ b/rllib/tuned_examples/ppo/hopper-ppo.yaml @@ -7,9 +7,9 @@ hopper-ppo: framework: torch gamma: 0.995 kl_coeff: 1.0 - num_sgd_iter: 20 + num_epochs: 20 lr: .0001 - sgd_minibatch_size: 32768 + minibatch_size: 32768 train_batch_size: 160000 num_env_runners: 64 num_gpus: 4 diff --git a/rllib/tuned_examples/ppo/humanoid-ppo-gae.yaml b/rllib/tuned_examples/ppo/humanoid-ppo-gae.yaml index ace034852908..779e42f50626 100644 --- 
a/rllib/tuned_examples/ppo/humanoid-ppo-gae.yaml +++ b/rllib/tuned_examples/ppo/humanoid-ppo-gae.yaml @@ -11,9 +11,9 @@ humanoid-ppo-gae: lambda: 0.95 clip_param: 0.2 kl_coeff: 1.0 - num_sgd_iter: 20 + num_epochs: 20 lr: .0001 - sgd_minibatch_size: 32768 + minibatch_size: 32768 horizon: 5000 train_batch_size: 320000 model: diff --git a/rllib/tuned_examples/ppo/humanoid-ppo.yaml b/rllib/tuned_examples/ppo/humanoid-ppo.yaml index 5a26d07172eb..8a22c2e9607c 100644 --- a/rllib/tuned_examples/ppo/humanoid-ppo.yaml +++ b/rllib/tuned_examples/ppo/humanoid-ppo.yaml @@ -9,9 +9,9 @@ humanoid-ppo: framework: torch gamma: 0.995 kl_coeff: 1.0 - num_sgd_iter: 20 + num_epochs: 20 lr: .0001 - sgd_minibatch_size: 32768 + minibatch_size: 32768 train_batch_size: 320000 model: free_log_std: true diff --git a/rllib/tuned_examples/ppo/memory-leak-test-ppo.yaml b/rllib/tuned_examples/ppo/memory-leak-test-ppo.yaml index 5eafdd533401..631e65216953 100644 --- a/rllib/tuned_examples/ppo/memory-leak-test-ppo.yaml +++ b/rllib/tuned_examples/ppo/memory-leak-test-ppo.yaml @@ -13,5 +13,5 @@ memory-leak-test-ppo: num_env_runners: 4 num_envs_per_env_runner: 5 train_batch_size: 500 - sgd_minibatch_size: 256 - num_sgd_iter: 5 + minibatch_size: 256 + num_epochs: 5 diff --git a/rllib/tuned_examples/ppo/memory_leak_test_ppo_new_stack.py b/rllib/tuned_examples/ppo/memory_leak_test_ppo_new_stack.py index 65c6d3dc4261..bd3794daf41d 100644 --- a/rllib/tuned_examples/ppo/memory_leak_test_ppo_new_stack.py +++ b/rllib/tuned_examples/ppo/memory_leak_test_ppo_new_stack.py @@ -14,5 +14,5 @@ num_env_runners=4, num_envs_per_env_runner=5, ) - .training(train_batch_size=500, sgd_minibatch_size=256, num_sgd_iter=5) + .training(train_batch_size=500, minibatch_size=256, num_epochs=5) ) diff --git a/rllib/tuned_examples/ppo/multi_agent_cartpole_ppo.py b/rllib/tuned_examples/ppo/multi_agent_cartpole_ppo.py index 054cfc056831..0dd22ed050a1 100644 --- a/rllib/tuned_examples/ppo/multi_agent_cartpole_ppo.py +++ b/rllib/tuned_examples/ppo/multi_agent_cartpole_ppo.py @@ -36,7 +36,7 @@ .training( gamma=0.99, lr=0.0003, - num_sgd_iter=6, + num_epochs=6, vf_loss_coeff=0.01, use_kl_loss=True, ) diff --git a/rllib/tuned_examples/ppo/multi_agent_pendulum_ppo.py b/rllib/tuned_examples/ppo/multi_agent_pendulum_ppo.py index 082d505efcce..42f0398a97bd 100644 --- a/rllib/tuned_examples/ppo/multi_agent_pendulum_ppo.py +++ b/rllib/tuned_examples/ppo/multi_agent_pendulum_ppo.py @@ -31,7 +31,7 @@ lr=0.0003, lambda_=0.1, vf_clip_param=10.0, - num_sgd_iter=6, + num_epochs=6, ) .rl_module( model_config_dict={ diff --git a/rllib/tuned_examples/ppo/multi_agent_stateless_cartpole_ppo.py b/rllib/tuned_examples/ppo/multi_agent_stateless_cartpole_ppo.py index a0b515c7f103..f307dd726fd6 100644 --- a/rllib/tuned_examples/ppo/multi_agent_stateless_cartpole_ppo.py +++ b/rllib/tuned_examples/ppo/multi_agent_stateless_cartpole_ppo.py @@ -38,7 +38,7 @@ .training( lr=0.0003 * ((args.num_gpus or 1) ** 0.5), gamma=0.99, - num_sgd_iter=6, + num_epochs=6, vf_loss_coeff=0.05, use_kl_loss=True, ) diff --git a/rllib/tuned_examples/ppo/pendulum-ppo.yaml b/rllib/tuned_examples/ppo/pendulum-ppo.yaml index ae60cfd07ec4..7ab57c621a97 100644 --- a/rllib/tuned_examples/ppo/pendulum-ppo.yaml +++ b/rllib/tuned_examples/ppo/pendulum-ppo.yaml @@ -16,7 +16,7 @@ pendulum-ppo: lambda: 0.1 gamma: 0.95 lr: 0.0003 - sgd_minibatch_size: 64 + minibatch_size: 64 observation_filter: MeanStdFilter model: fcnet_activation: relu diff --git a/rllib/tuned_examples/ppo/pendulum-transformed-actions-ppo.yaml 
b/rllib/tuned_examples/ppo/pendulum-transformed-actions-ppo.yaml index e573eabbfe72..04a12eb3c46d 100644 --- a/rllib/tuned_examples/ppo/pendulum-transformed-actions-ppo.yaml +++ b/rllib/tuned_examples/ppo/pendulum-transformed-actions-ppo.yaml @@ -24,8 +24,8 @@ pendulum-ppo: gamma: 0.95 lr: 0.0003 train_batch_size: 512 - sgd_minibatch_size: 64 - num_sgd_iter: 6 + minibatch_size: 64 + num_epochs: 6 observation_filter: MeanStdFilter model: fcnet_activation: relu diff --git a/rllib/tuned_examples/ppo/pendulum_ppo.py b/rllib/tuned_examples/ppo/pendulum_ppo.py index 84c0ddd74f90..9ffd945e0979 100644 --- a/rllib/tuned_examples/ppo/pendulum_ppo.py +++ b/rllib/tuned_examples/ppo/pendulum_ppo.py @@ -23,7 +23,7 @@ lr=0.0003, lambda_=0.1, vf_clip_param=10.0, - num_sgd_iter=6, + num_epochs=6, ) .rl_module( model_config_dict={ diff --git a/rllib/tuned_examples/ppo/repeatafterme-ppo-lstm.yaml b/rllib/tuned_examples/ppo/repeatafterme-ppo-lstm.yaml index d59329616193..490b63245f15 100644 --- a/rllib/tuned_examples/ppo/repeatafterme-ppo-lstm.yaml +++ b/rllib/tuned_examples/ppo/repeatafterme-ppo-lstm.yaml @@ -17,7 +17,7 @@ repeat-after-me-ppo-w-lstm: lr: 0.0003 num_env_runners: 0 num_envs_per_env_runner: 20 - num_sgd_iter: 5 + num_epochs: 5 entropy_coeff: 0.00001 model: use_lstm: true diff --git a/rllib/tuned_examples/ppo/stateless_cartpole_ppo.py b/rllib/tuned_examples/ppo/stateless_cartpole_ppo.py index 0df7a29abbdf..9e188d3982f4 100644 --- a/rllib/tuned_examples/ppo/stateless_cartpole_ppo.py +++ b/rllib/tuned_examples/ppo/stateless_cartpole_ppo.py @@ -30,7 +30,7 @@ .training( lr=0.0003 * ((args.num_gpus or 1) ** 0.5), gamma=0.99, - num_sgd_iter=6, + num_epochs=6, vf_loss_coeff=0.05, use_kl_loss=True, ) diff --git a/rllib/tuned_examples/ppo/unity3d-soccer-strikers-vs-goalie-ppo.yaml b/rllib/tuned_examples/ppo/unity3d-soccer-strikers-vs-goalie-ppo.yaml index cdcfee928c0f..c6ceb1461149 100644 --- a/rllib/tuned_examples/ppo/unity3d-soccer-strikers-vs-goalie-ppo.yaml +++ b/rllib/tuned_examples/ppo/unity3d-soccer-strikers-vs-goalie-ppo.yaml @@ -30,13 +30,13 @@ unity3d-soccer-strikers-vs-goalie-ppo: lr: 0.0003 lambda: 0.95 gamma: 0.99 - sgd_minibatch_size: 256 + minibatch_size: 256 train_batch_size: 4000 clip_param: 0.2 # For running in editor, just use one Worker (we only have # one Unity running)! num_env_runners: 10 - num_sgd_iter: 20 + num_epochs: 20 rollout_fragment_length: 200 model: fcnet_hiddens: [512, 512] diff --git a/rllib/tuned_examples/ppo/walker2d-ppo.yaml b/rllib/tuned_examples/ppo/walker2d-ppo.yaml index 13305acc0c9a..9429f0d4161d 100644 --- a/rllib/tuned_examples/ppo/walker2d-ppo.yaml +++ b/rllib/tuned_examples/ppo/walker2d-ppo.yaml @@ -6,9 +6,9 @@ walker2d-v1-ppo: # Works for both torch and tf. 
     framework: torch
     kl_coeff: 1.0
-    num_sgd_iter: 20
+    num_epochs: 20
     lr: .0001
-    sgd_minibatch_size: 32768
+    minibatch_size: 32768
     train_batch_size: 320000
     num_env_runners: 64
     num_gpus: 4
diff --git a/rllib/utils/exploration/tests/test_curiosity.py b/rllib/utils/exploration/tests/test_curiosity.py
index ddc5939c5df5..4531154371f0 100644
--- a/rllib/utils/exploration/tests/test_curiosity.py
+++ b/rllib/utils/exploration/tests/test_curiosity.py
@@ -263,7 +263,7 @@ def test_curiosity_on_partially_observable_domain(self):
                     "fcnet_hiddens": [256, 256],
                     "fcnet_activation": "relu",
                 },
-                num_sgd_iter=8,
+                num_epochs=8,
             )
         )
 
diff --git a/rllib/utils/minibatch_utils.py b/rllib/utils/minibatch_utils.py
index 883d08d84ade..fdcab82146aa 100644
--- a/rllib/utils/minibatch_utils.py
+++ b/rllib/utils/minibatch_utils.py
@@ -9,19 +9,36 @@
 
 @DeveloperAPI
 class MiniBatchIteratorBase:
-    """The base class for all minibatch iterators.
-
-    Args:
-        batch: The input multi-agent batch.
-        minibatch_size: The size of the minibatch for each module_id.
-        num_iters: The number of epochs to cover. If the input batch is smaller than
-            minibatch_size, then the iterator will cycle through the batch until it
-            has covered num_iters epochs.
-    """
+    """The base class for all minibatch iterators."""
 
     def __init__(
-        self, batch: MultiAgentBatch, minibatch_size: int, num_iters: int = 1
+        self,
+        batch: MultiAgentBatch,
+        *,
+        num_epochs: int = 1,
+        shuffle_batch_per_epoch: bool = True,
+        minibatch_size: int,
+        num_total_minibatches: int = 0,
     ) -> None:
+        """Initializes a MiniBatchIteratorBase instance.
+
+        Args:
+            batch: The input multi-agent batch.
+            num_epochs: The number of complete passes over the entire train batch. Each
+                pass might be further split into n minibatches (if `minibatch_size` is
+                provided). The train batch is typically generated from episodes through
+                the Learner connector pipeline.
+            shuffle_batch_per_epoch: Whether to shuffle the train batch once per epoch.
+            minibatch_size: The size of the minibatches into which the train batch is
+                further split per epoch.
+            num_total_minibatches: The total number of minibatches to loop through
+                (over all `num_epochs` epochs). It's only required to set this to != 0
+                in multi-agent + multi-GPU situations, in which the MultiAgentEpisodes
+                themselves are roughly sharded equally; however, they might contain
+                SingleAgentEpisodes with very lopsided length distributions. Thus,
+                without this fixed, pre-computed value, one Learner might go through a
+                different number of minibatch passes than others, causing a deadlock.
+        """
         pass
 
 
@@ -29,58 +46,56 @@ def __init__(
 class MiniBatchCyclicIterator(MiniBatchIteratorBase):
     """This implements a simple multi-agent minibatch iterator.
 
-
     This iterator will split the input multi-agent batch into minibatches where
     the size of batch for each module_id (aka policy_id) is equal to minibatch_size.
     If the input batch is smaller than minibatch_size, then the iterator will cycle
-    through the batch until it has covered num_iters epochs.
-
-    Args:
-        batch: The input multi-agent batch.
-        minibatch_size: The size of the minibatch for each module_id.
-        num_iters: The minimum number of epochs to cover. If the input batch is smaller
-            than minibatch_size, then the iterator will cycle through the batch until
-            it has covered at least num_iters epochs.
+    through the batch until it has covered `num_epochs` epochs.
""" - def __init__( self, batch: MultiAgentBatch, + *, + num_epochs: int = 1, + shuffle_batch_per_epoch: bool = True, minibatch_size: int, - num_iters: int = 1, - uses_new_env_runners: bool = False, - num_total_mini_batches: int = 0, - shuffle: bool = False, + num_total_minibatches: int = 0, + _uses_new_env_runners: bool = False, ) -> None: - super().__init__(batch, minibatch_size, num_iters) + """Initializes a MiniBatchCyclicIterator instance.""" + super().__init__( + batch, + num_epochs=num_epochs, + minibatch_size=minibatch_size, + shuffle_batch_per_epoch=shuffle_batch_per_epoch, + ) + self._batch = batch self._minibatch_size = minibatch_size - self._num_iters = num_iters + self._num_epochs = num_epochs + self._shuffle_batch_per_epoch = shuffle_batch_per_epoch # mapping from module_id to the start index of the batch self._start = {mid: 0 for mid in batch.policy_batches.keys()} # mapping from module_id to the number of epochs covered for each module_id self._num_covered_epochs = {mid: 0 for mid in batch.policy_batches.keys()} - self._uses_new_env_runners = uses_new_env_runners - - self._mini_batch_count = 0 - self._num_total_mini_batches = num_total_mini_batches + self._uses_new_env_runners = _uses_new_env_runners - self._shuffle = shuffle + self._minibatch_count = 0 + self._num_total_minibatches = num_total_minibatches def __iter__(self): while ( # Make sure each item in the total batch gets at least iterated over - # `self._num_iters` times. + # `self._num_epochs` times. ( - self._num_total_mini_batches == 0 - and min(self._num_covered_epochs.values()) < self._num_iters + self._num_total_minibatches == 0 + and min(self._num_covered_epochs.values()) < self._num_epochs ) # Make sure we reach at least the given minimum number of mini-batches. or ( - self._num_total_mini_batches > 0 - and self._mini_batch_count < self._num_total_mini_batches + self._num_total_minibatches > 0 + and self._minibatch_count < self._num_total_minibatches ) ): minibatch = {} @@ -89,7 +104,7 @@ def __iter__(self): # Shuffle the individual single-agent batch, if required. # This should happen once per minibatch iteration in order to make # each iteration go through a different set of minibatches. 
- if self._shuffle: + if self._shuffle_batch_per_epoch: module_batch.shuffle() if len(module_batch) == 0: @@ -166,12 +181,12 @@ def get_len(b): minibatch = MultiAgentBatch(minibatch, len(self._batch)) yield minibatch - self._mini_batch_count += 1 + self._minibatch_count += 1 class MiniBatchDummyIterator(MiniBatchIteratorBase): - def __init__(self, batch: MultiAgentBatch, minibatch_size: int, num_iters: int = 1): - super().__init__(batch, minibatch_size, num_iters) + def __init__(self, batch: MultiAgentBatch, **kwargs): + super().__init__(batch, **kwargs) self._batch = batch def __iter__(self): diff --git a/rllib/utils/tests/test_minibatch_utils.py b/rllib/utils/tests/test_minibatch_utils.py index cd5f3bbddf4d..879e8b522a1b 100644 --- a/rllib/utils/tests/test_minibatch_utils.py +++ b/rllib/utils/tests/test_minibatch_utils.py @@ -14,20 +14,20 @@ tf1.enable_eager_execution() CONFIGS = [ - {"mini_batch_size": 256, "num_sgd_iter": 30, "agent_steps": (1652, 1463)}, - {"mini_batch_size": 128, "num_sgd_iter": 10, "agent_steps": (1000, 2)}, - {"mini_batch_size": 128, "num_sgd_iter": 3, "agent_steps": (56, 56)}, - {"mini_batch_size": 128, "num_sgd_iter": 7, "agent_steps": (56, 56)}, - {"mini_batch_size": 128, "num_sgd_iter": 10, "agent_steps": (56, 56)}, - {"mini_batch_size": 128, "num_sgd_iter": 10, "agent_steps": (56, 3)}, - {"mini_batch_size": 128, "num_sgd_iter": 10, "agent_steps": (56, 4)}, - {"mini_batch_size": 128, "num_sgd_iter": 10, "agent_steps": (56, 55)}, - {"mini_batch_size": 128, "num_sgd_iter": 10, "agent_steps": (400, 400)}, - {"mini_batch_size": 128, "num_sgd_iter": 10, "agent_steps": (64, 64)}, + {"minibatch_size": 256, "num_epochs": 30, "agent_steps": (1652, 1463)}, + {"minibatch_size": 128, "num_epochs": 10, "agent_steps": (1000, 2)}, + {"minibatch_size": 128, "num_epochs": 3, "agent_steps": (56, 56)}, + {"minibatch_size": 128, "num_epochs": 7, "agent_steps": (56, 56)}, + {"minibatch_size": 128, "num_epochs": 10, "agent_steps": (56, 56)}, + {"minibatch_size": 128, "num_epochs": 10, "agent_steps": (56, 3)}, + {"minibatch_size": 128, "num_epochs": 10, "agent_steps": (56, 4)}, + {"minibatch_size": 128, "num_epochs": 10, "agent_steps": (56, 55)}, + {"minibatch_size": 128, "num_epochs": 10, "agent_steps": (400, 400)}, + {"minibatch_size": 128, "num_epochs": 10, "agent_steps": (64, 64)}, # W/ SEQ_LENS. { - "mini_batch_size": 64, - "num_sgd_iter": 1, + "minibatch_size": 64, + "num_epochs": 1, "agent_steps": (128,), "seq_lens": [16, 16, 16, 16, 16, 16, 2, 2, 14, 14], "padding": True, @@ -39,8 +39,8 @@ class TestMinibatchUtils(unittest.TestCase): def test_minibatch_cyclic_iterator(self): for config in CONFIGS: - mini_batch_size = config["mini_batch_size"] - num_sgd_iter = config["num_sgd_iter"] + minibatch_size = config["minibatch_size"] + num_epochs = config["num_epochs"] agent_steps = config["agent_steps"] seq_lens = config.get("seq_lens") max_seq_len = None @@ -85,7 +85,9 @@ def test_minibatch_cyclic_iterator(self): ) mb = MultiAgentBatch(sample_batches, num_env_steps) - batch_iter = MiniBatchCyclicIterator(mb, mini_batch_size, num_sgd_iter) + batch_iter = MiniBatchCyclicIterator( + mb, minibatch_size=minibatch_size, num_epochs=num_epochs + ) print(config) iteration_counter = 0 for batch in batch_iter: @@ -94,14 +96,14 @@ def test_minibatch_cyclic_iterator(self): print(batch["pol0"]["obs"]) print("*" * 80) # Check that for each policy the batch size is equal to the - # mini_batch_size. + # minibatch_size. 
for policy_batch in batch.policy_batches.values(): - check(policy_batch.count, mini_batch_size) + check(policy_batch.count, minibatch_size) iteration_counter += 1 # For each policy check that the last item in batch matches the expected - # values, i.e. iteration_counter * mini_batch_size % agent_steps - 1. - total_steps = iteration_counter * mini_batch_size + # values, i.e. iteration_counter * minibatch_size % agent_steps - 1. + total_steps = iteration_counter * minibatch_size for policy_idx, policy_batch in enumerate( batch.policy_batches.values() ): @@ -111,9 +113,9 @@ def test_minibatch_cyclic_iterator(self): check(policy_batch["obs"][-1], expected_last_item) # Check iteration counter (should be - # ceil(num_gsd_iter * max(agent_steps) / mini_batch_size)). + # ceil(num_gsd_iter * max(agent_steps) / minibatch_size)). expected_iteration_counter = np.ceil( - num_sgd_iter * max(agent_steps) / mini_batch_size + num_epochs * max(agent_steps) / minibatch_size ) if not seq_lens: check(iteration_counter, expected_iteration_counter) From 38f0d99476c07dcb242405aef61d722574f46801 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Mon, 2 Sep 2024 16:36:06 +0200 Subject: [PATCH 03/20] wip Signed-off-by: sven1977 --- rllib/algorithms/appo/appo.py | 1 + rllib/algorithms/impala/impala.py | 1 + rllib/algorithms/impala/impala_learner.py | 26 +++++++++++++++++++++++ rllib/policy/sample_batch.py | 3 +++ 4 files changed, 31 insertions(+) diff --git a/rllib/algorithms/appo/appo.py b/rllib/algorithms/appo/appo.py index 73ceef6f3264..d2db78febbce 100644 --- a/rllib/algorithms/appo/appo.py +++ b/rllib/algorithms/appo/appo.py @@ -98,6 +98,7 @@ def __init__(self, algo_class=None): self.use_kl_loss = False self.kl_coeff = 1.0 self.kl_target = 0.01 + #self.shuffle_batch_per_epoch = True # Override some of IMPALAConfig's default values with APPO-specific values. self.num_env_runners = 2 diff --git a/rllib/algorithms/impala/impala.py b/rllib/algorithms/impala/impala.py index 9ad590f72f34..ea9da381b828 100644 --- a/rllib/algorithms/impala/impala.py +++ b/rllib/algorithms/impala/impala.py @@ -167,6 +167,7 @@ def __init__(self, algo_class=None): self._lr_vf = 0.0005 # @OldAPIstack # Override some of AlgorithmConfig's default values with IMPALA-specific values. + self.num_learners = 1 self.rollout_fragment_length = 50 self.train_batch_size = 500 # @OldAPIstack self.train_batch_size_per_learner = 500 diff --git a/rllib/algorithms/impala/impala_learner.py b/rllib/algorithms/impala/impala_learner.py index f6f6df0cdb1e..e0962226db48 100644 --- a/rllib/algorithms/impala/impala_learner.py +++ b/rllib/algorithms/impala/impala_learner.py @@ -11,6 +11,8 @@ from ray.rllib.algorithms.impala.impala import LEARNER_RESULTS_CURR_ENTROPY_COEFF_KEY from ray.rllib.core.columns import Columns from ray.rllib.core.learner.learner import Learner +from ray.rllib.connectors.common import AddStatesFromEpisodesToBatch +from ray.rllib.connectors.connector_v2 import ConnectorV2 from ray.rllib.connectors.learner import AddOneTsToEpisodesAndTruncate from ray.rllib.policy.sample_batch import MultiAgentBatch, SampleBatch from ray.rllib.utils.annotations import ( @@ -65,6 +67,10 @@ def build(self) -> None: # slots to mask out). if self.config.add_default_connectors_to_learner_pipeline: self._learner_connector.prepend(AddOneTsToEpisodesAndTruncate()) + self._learner_connector.insert_after( + AddStatesFromEpisodesToBatch, + AddVTraceSeqLensNoRNN, + ) # Create and start the GPU-loader thread. 
It picks up train-ready batches from # the "GPU-loader queue" and loads them to the GPU, then places the GPU batches @@ -287,3 +293,23 @@ def step(self): self._out_queue.put(copy.deepcopy(results)) self.metrics.log_value(QUEUE_SIZE_RESULTS_QUEUE, self._out_queue.qsize()) + + +class AddVTraceSeqLensNoRNN(ConnectorV2): + def __init__( + self, + input_observation_space=None, + input_action_space=None, + *, + rollout_fragment_length: int, + **kwargs, + ): + super().__init__(input_observation_space, input_action_space, **kwargs) + self._rollout_fragment_length = rollout_fragment_length + + @override(ConnectorV2) + def __call__(self, *, rl_module, batch, episodes): + if Columns.SEQ_LENS not in batch: + pass + TODO # Continue implementing here + return batch diff --git a/rllib/policy/sample_batch.py b/rllib/policy/sample_batch.py index 098ddc2218ad..33a0b5eea25b 100644 --- a/rllib/policy/sample_batch.py +++ b/rllib/policy/sample_batch.py @@ -483,7 +483,10 @@ def shuffle(self) -> "SampleBatch": permutation = np.random.permutation(len(self[SampleBatch.SEQ_LENS])) self_as_dict = dict(self) + infos = self_as_dict.pop(Columns.INFOS, None) shuffled = tree.map_structure(lambda v: v[permutation], self_as_dict) + if infos is not None: + self_as_dict[Columns.INFOS] = [infos[i] for i in permutation] self.update(shuffled) From ea8075f793ad49d2afc6af869ea5e6abc8d5f7f2 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Mon, 2 Sep 2024 19:47:14 +0200 Subject: [PATCH 04/20] wip Signed-off-by: sven1977 --- rllib/tuned_examples/ppo/cartpole_ppo.py | 23 ++++------------------- rllib/utils/minibatch_utils.py | 11 +++++------ 2 files changed, 9 insertions(+), 25 deletions(-) diff --git a/rllib/tuned_examples/ppo/cartpole_ppo.py b/rllib/tuned_examples/ppo/cartpole_ppo.py index 612f267f188a..9d8f09d43e06 100644 --- a/rllib/tuned_examples/ppo/cartpole_ppo.py +++ b/rllib/tuned_examples/ppo/cartpole_ppo.py @@ -1,13 +1,9 @@ from ray.rllib.algorithms.ppo import PPOConfig -from ray.rllib.utils.metrics import ( - ENV_RUNNER_RESULTS, - EPISODE_RETURN_MEAN, - EVALUATION_RESULTS, - NUM_ENV_STEPS_SAMPLED_LIFETIME, -) from ray.rllib.utils.test_utils import add_rllib_example_script_args -parser = add_rllib_example_script_args() +parser = add_rllib_example_script_args( + default_reward=450.0, default_timesteps=300000 +) parser.set_defaults(enable_new_api_stack=True) # Use `parser` to add your own custom command line options to this script # and (if needed) use their values toset up `config` below. @@ -35,21 +31,10 @@ vf_loss_coeff=0.01, use_kl_loss=True, ) - .evaluation( - evaluation_num_env_runners=1, - evaluation_interval=1, - evaluation_parallel_to_training=True, - evaluation_config=PPOConfig.overrides(exploration=False), - ) ) -stop = { - f"{NUM_ENV_STEPS_SAMPLED_LIFETIME}": 200000, - f"{EVALUATION_RESULTS}/{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": 350.0, -} - if __name__ == "__main__": from ray.rllib.utils.test_utils import run_rllib_example_script_experiment - run_rllib_example_script_experiment(config, args, stop=stop) + run_rllib_example_script_experiment(config, args) diff --git a/rllib/utils/minibatch_utils.py b/rllib/utils/minibatch_utils.py index fdcab82146aa..cad11ddac9aa 100644 --- a/rllib/utils/minibatch_utils.py +++ b/rllib/utils/minibatch_utils.py @@ -101,12 +101,6 @@ def __iter__(self): minibatch = {} for module_id, module_batch in self._batch.policy_batches.items(): - # Shuffle the individual single-agent batch, if required. 
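Conceptually, the shuffle touched above permutes rows with one shared permutation, re-orders the per-timestep `infos` list element by element (a Python list cannot be fancy-indexed by a NumPy array), and, for time-ranked data, permutes only along the batch axis so each sequence stays intact. A NumPy-only sketch of that idea, with invented shapes and values:

import numpy as np

# Toy flat batch: 5 timesteps, with per-timestep infos kept as a plain list.
obs = np.arange(10).reshape(5, 2)
rewards = np.array([0.1, 0.2, 0.3, 0.4, 0.5])
infos = [{"t": t} for t in range(5)]

# One permutation, applied to every column so rows stay aligned.
perm = np.random.permutation(len(obs))
obs, rewards = obs[perm], rewards[perm]
infos = [infos[i] for i in perm]  # lists are re-ordered element by element

# Toy time-ranked batch: 3 zero-padded sequences of max length 4.
seqs = np.arange(24).reshape(3, 4, 2)
seq_lens = np.array([4, 2, 3])
# Permute along axis 0 (the batch axis) only; the time axis is untouched.
seq_perm = np.random.permutation(len(seq_lens))
seqs, seq_lens = seqs[seq_perm], seq_lens[seq_perm]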
- # This should happen once per minibatch iteration in order to make - # each iteration go through a different set of minibatches. - if self._shuffle_batch_per_epoch: - module_batch.shuffle() - if len(module_batch) == 0: raise ValueError( f"The batch for module_id {module_id} is empty! " @@ -164,6 +158,11 @@ def get_len(b): n_steps -= len_sample s = 0 self._num_covered_epochs[module_id] += 1 + # Shuffle the individual single-agent batch, if required. + # This should happen once per minibatch iteration in order to make + # each iteration go through a different set of minibatches. + if self._shuffle_batch_per_epoch: + module_batch.shuffle() e = s + n_steps # end if e > s: From 4e1e42eb1993d98c6ac7ac3330bd859ac175a79b Mon Sep 17 00:00:00 2001 From: sven1977 Date: Tue, 3 Sep 2024 10:36:15 +0200 Subject: [PATCH 05/20] wip Signed-off-by: sven1977 --- rllib/algorithms/impala/impala.py | 45 +++++++------------ rllib/core/learner/learner.py | 31 ++++++++++--- rllib/core/learner/learner_group.py | 51 ++++++++++++++++------ rllib/tuned_examples/appo/cartpole_appo.py | 2 +- rllib/tuned_examples/ppo/cartpole_ppo.py | 13 +----- rllib/utils/minibatch_utils.py | 1 + 6 files changed, 83 insertions(+), 60 deletions(-) diff --git a/rllib/algorithms/impala/impala.py b/rllib/algorithms/impala/impala.py index ea9da381b828..a6c69876aaf9 100644 --- a/rllib/algorithms/impala/impala.py +++ b/rllib/algorithms/impala/impala.py @@ -171,7 +171,6 @@ def __init__(self, algo_class=None): self.rollout_fragment_length = 50 self.train_batch_size = 500 # @OldAPIstack self.train_batch_size_per_learner = 500 - #self._minibatch_size = "auto" self.num_env_runners = 2 self.num_gpus = 1 # @OldAPIstack self.lr = 0.0005 @@ -436,21 +435,21 @@ def validate(self) -> None: "config.training(_tf_policy_handles_more_than_one_loss=True)." ) # Learner API specific checks. - #if ( - # self.enable_rl_module_and_learner - # and self._minibatch_size != "auto" - # and not ( - # (self.minibatch_size % self.rollout_fragment_length == 0) - # and self.minibatch_size <= self.total_train_batch_size - # ) - #): - # raise ValueError( - # f"`minibatch_size` ({self._minibatch_size}) must either be 'auto' " - # "or a multiple of `rollout_fragment_length` " - # f"({self.rollout_fragment_length}) while at the same time smaller " - # "than or equal to `total_train_batch_size` " - # f"({self.total_train_batch_size})!" - # ) + if ( + self.enable_rl_module_and_learner + and self.minibatch_size is not None + and not ( + (self.minibatch_size % self.rollout_fragment_length == 0) + and self.minibatch_size <= self.total_train_batch_size + ) + ): + raise ValueError( + f"`minibatch_size` ({self._minibatch_size}) must either be None " + "or a multiple of `rollout_fragment_length` " + f"({self.rollout_fragment_length}) while at the same time smaller " + "than or equal to `total_train_batch_size` " + f"({self.total_train_batch_size})!" + ) @property def replay_ratio(self) -> float: @@ -460,20 +459,6 @@ def replay_ratio(self) -> float: """ return (1 / self.replay_proportion) if self.replay_proportion > 0 else 0.0 - #@property - #def minibatch_size(self): - # # If 'auto', use the train_batch_size (meaning each SGD iter is a single pass - # # through the entire train batch). Otherwise, use user provided setting. 
- # return ( - # ( - # self.train_batch_size_per_learner - # if self.enable_env_runner_and_connector_v2 - # else self.train_batch_size - # ) - # if self._minibatch_size == "auto" - # else self._minibatch_size - # ) - @override(AlgorithmConfig) def get_default_learner_class(self): if self.framework_str == "torch": diff --git a/rllib/core/learner/learner.py b/rllib/core/learner/learner.py index b5873ac7e826..8e8247741ebd 100644 --- a/rllib/core/learner/learner.py +++ b/rllib/core/learner/learner.py @@ -927,6 +927,7 @@ def update_from_batch( timesteps: Optional[Dict[str, Any]] = None, num_epochs: int = 1, minibatch_size: Optional[int] = None, + shuffle_batch_per_epoch: bool = True, # Deprecated args. num_iters=DEPRECATED_VALUE, ) -> ResultDict: @@ -946,7 +947,14 @@ def update_from_batch( provided). The train batch is generated from the given `episodes` through the Learner connector pipeline. minibatch_size: The size of minibatches to use to further split the train - batch into. + `batch` into sub-batches. The `batch` is then iterated over n times + where n is `len(batch) // minibatch_size`. + shuffle_batch_per_epoch: Whether to shuffle the train batch once per epoch. + If the train batch has a time rank (axis=1), shuffling will only take + place along the batch axis to not disturb any intact (episode) + trajectories. Also, shuffling is always skipped if `minibatch_size` is + None, meaning the entire train batch is processed each epoch, making it + unnecessary to shuffle. Returns: A `ResultDict` object produced by a call to `self.metrics.reduce()`. The @@ -966,6 +974,7 @@ def update_from_batch( timesteps=timesteps, num_epochs=num_epochs, minibatch_size=minibatch_size, + shuffle_batch_per_epoch=shuffle_batch_per_epoch, ) def update_from_episodes( @@ -977,6 +986,7 @@ def update_from_episodes( num_epochs: int = 1, minibatch_size: Optional[int] = None, num_total_minibatches: int = 0, + shuffle_batch_per_epoch: bool = True, # Deprecated args. num_iters=DEPRECATED_VALUE, ) -> ResultDict: @@ -996,8 +1006,16 @@ def update_from_episodes( provided). The train batch is generated from the given `episodes` through the Learner connector pipeline. minibatch_size: The size of minibatches to use to further split the train - batch into. The train batch is generated from the given `episodes` - through the Learner connector pipeline. + `batch` into sub-batches. The `batch` is then iterated over n times + where n is `len(batch) // minibatch_size`. The train batch is generated + from the given `episodes` through the Learner connector pipeline. + shuffle_batch_per_epoch: Whether to shuffle the train batch once per epoch. + If the train batch has a time rank (axis=1), shuffling will only take + place along the batch axis to not disturb any intact (episode) + trajectories. Also, shuffling is always skipped if `minibatch_size` is + None, meaning the entire train batch is processed each epoch, making it + unnecessary to shuffle. The train batch is generated from the given + `episodes` through the Learner connector pipeline. num_total_minibatches: The total number of minibatches to loop through (over all `num_epochs` epochs). 
It's only required to set this to != 0 in multi-agent + multi-GPU situations, in which the MultiAgentEpisodes @@ -1024,6 +1042,7 @@ def update_from_episodes( timesteps=timesteps, minibatch_size=minibatch_size, num_epochs=num_epochs, + shuffle_batch_per_epoch=shuffle_batch_per_epoch, num_total_minibatches=num_total_minibatches, ) @@ -1037,7 +1056,7 @@ def update_from_iterator( **kwargs, ): self._check_is_built() - #minibatch_size = minibatch_size or 32 + # minibatch_size = minibatch_size or 32 # Call `before_gradient_based_update` to allow for non-gradient based # preparations-, logging-, and update logic to happen. @@ -1294,7 +1313,9 @@ def _update_from_batch_or_episodes( if minibatch_size: if self._learner_connector is not None: - batch_iter = partial(MiniBatchCyclicIterator, _uses_new_env_runners=True) + batch_iter = partial( + MiniBatchCyclicIterator, _uses_new_env_runners=True + ) else: batch_iter = MiniBatchCyclicIterator elif num_epochs > 1: diff --git a/rllib/core/learner/learner_group.py b/rllib/core/learner/learner_group.py index d746265c9b23..48bd5628b6ef 100644 --- a/rllib/core/learner/learner_group.py +++ b/rllib/core/learner/learner_group.py @@ -222,7 +222,7 @@ def update_from_batch( async_update: bool = False, return_state: bool = False, num_epochs: int = 1, - shuffle_batch_per_epoch: bool = False, + shuffle_batch_per_epoch: bool = True, minibatch_size: Optional[int] = None, # User kwargs. **kwargs, @@ -243,9 +243,18 @@ def update_from_batch( Learner workers' states should be identical, so we use the first Learner's state here. Useful for avoiding an extra `get_weights()` call, e.g. for synchronizing EnvRunner weights. - minibatch_size: The minibatch size to use for the update. - num_iters: The number of complete passes over all the sub-batches in the - input multi-agent batch. + num_epochs: The number of complete passes over the entire train batch. Each + pass might be further split into n minibatches (if `minibatch_size` + provided). + minibatch_size: The size of minibatches to use to further split the train + `batch` into sub-batches. The `batch` is then iterated over n times + where n is `len(batch) // minibatch_size`. + shuffle_batch_per_epoch: Whether to shuffle the train batch once per epoch. + If the train batch has a time rank (axis=1), shuffling will only take + place along the batch axis to not disturb any intact (episode) + trajectories. Also, shuffling is always skipped if `minibatch_size` is + None, meaning the entire train batch is processed each epoch, making it + unnecessary to shuffle. Returns: If `async_update` is False, a dictionary with the reduced results of the @@ -262,8 +271,9 @@ def update_from_batch( timesteps=timesteps, async_update=async_update, return_state=return_state, - minibatch_size=minibatch_size, num_epochs=num_epochs, + minibatch_size=minibatch_size, + shuffle_batch_per_epoch=shuffle_batch_per_epoch, **kwargs, ) @@ -275,8 +285,8 @@ def update_from_episodes( async_update: bool = False, return_state: bool = False, num_epochs: int = 1, - shuffle_batch_per_epoch: bool = False, minibatch_size: Optional[int] = None, + shuffle_batch_per_epoch: bool = True, # User kwargs. **kwargs, ) -> Union[Dict[str, Any], List[Dict[str, Any]], List[List[Dict[str, Any]]]]: @@ -296,9 +306,21 @@ def update_from_episodes( Learner workers' states should be identical, so we use the first Learner's state here. Useful for avoiding an extra `get_weights()` call, e.g. for synchronizing EnvRunner weights. - minibatch_size: The minibatch size to use for the update. 
- num_iters: The number of complete passes over all the sub-batches in the - input multi-agent batch. + num_epochs: The number of complete passes over the entire train batch. Each + pass might be further split into n minibatches (if `minibatch_size` + provided). The train batch is generated from the given `episodes` + through the Learner connector pipeline. + minibatch_size: The size of minibatches to use to further split the train + `batch` into sub-batches. The `batch` is then iterated over n times + where n is `len(batch) // minibatch_size`. The train batch is generated + from the given `episodes` through the Learner connector pipeline. + shuffle_batch_per_epoch: Whether to shuffle the train batch once per epoch. + If the train batch has a time rank (axis=1), shuffling will only take + place along the batch axis to not disturb any intact (episode) + trajectories. Also, shuffling is always skipped if `minibatch_size` is + None, meaning the entire train batch is processed each epoch, making it + unnecessary to shuffle. The train batch is generated from the given + `episodes` through the Learner connector pipeline. Returns: If async_update is False, a dictionary with the reduced results of the @@ -315,8 +337,9 @@ def update_from_episodes( timesteps=timesteps, async_update=async_update, return_state=return_state, - minibatch_size=minibatch_size, num_epochs=num_epochs, + minibatch_size=minibatch_size, + shuffle_batch_per_epoch=shuffle_batch_per_epoch, **kwargs, ) @@ -330,7 +353,7 @@ def _update( return_state: bool = False, num_epochs: int = 1, minibatch_size: Optional[int] = None, - shuffle_batch_per_epoch: bool = False, + shuffle_batch_per_epoch: bool = True, # Deprecated args. num_iters=DEPRECATED_VALUE, **kwargs, @@ -365,16 +388,18 @@ def _learner_update( result = _learner.update_from_batch( batch=_batch_shard, timesteps=_timesteps, - minibatch_size=minibatch_size, num_epochs=num_epochs, + minibatch_size=minibatch_size, + shuffle_batch_per_epoch=shuffle_batch_per_epoch, **_kwargs, ) else: result = _learner.update_from_episodes( episodes=_episodes_shard, timesteps=_timesteps, - minibatch_size=minibatch_size, num_epochs=num_epochs, + minibatch_size=minibatch_size, + shuffle_batch_per_epoch=shuffle_batch_per_epoch, num_total_minibatches=_num_total_minibatches, **_kwargs, ) diff --git a/rllib/tuned_examples/appo/cartpole_appo.py b/rllib/tuned_examples/appo/cartpole_appo.py index 865c4ce85c31..e8ffd6cff4f9 100644 --- a/rllib/tuned_examples/appo/cartpole_appo.py +++ b/rllib/tuned_examples/appo/cartpole_appo.py @@ -35,7 +35,7 @@ .rl_module( model_config_dict={ "fcnet_hiddens": [32], - #"vf_share_layers": True, + # "vf_share_layers": True, "uses_new_env_runners": True, }, ) diff --git a/rllib/tuned_examples/ppo/cartpole_ppo.py b/rllib/tuned_examples/ppo/cartpole_ppo.py index 27d931db04c6..18f11b9d8ffa 100644 --- a/rllib/tuned_examples/ppo/cartpole_ppo.py +++ b/rllib/tuned_examples/ppo/cartpole_ppo.py @@ -1,9 +1,7 @@ from ray.rllib.algorithms.ppo import PPOConfig from ray.rllib.utils.test_utils import add_rllib_example_script_args -parser = add_rllib_example_script_args( - default_reward=450.0, default_timesteps=300000 -) +parser = add_rllib_example_script_args(default_reward=450.0, default_timesteps=300000) parser.set_defaults(enable_new_api_stack=True) # Use `parser` to add your own custom command line options to this script # and (if needed) use their values toset up `config` below. 
@@ -14,7 +12,7 @@ .environment("CartPole-v1") .training( lr=0.0003, - num_sgd_iter=6, + num_epochs=6, vf_loss_coeff=0.01, ) .rl_module( @@ -24,13 +22,6 @@ "vf_share_layers": True, } ) - .training( - gamma=0.99, - lr=0.0003, - num_epochs=6, - vf_loss_coeff=0.01, - use_kl_loss=True, - ) ) diff --git a/rllib/utils/minibatch_utils.py b/rllib/utils/minibatch_utils.py index cad11ddac9aa..f07b7f23e64f 100644 --- a/rllib/utils/minibatch_utils.py +++ b/rllib/utils/minibatch_utils.py @@ -51,6 +51,7 @@ class MiniBatchCyclicIterator(MiniBatchIteratorBase): input batch is smaller than minibatch_size, then the iterator will cycle through the batch until it has covered `num_epochs` epochs. """ + def __init__( self, batch: MultiAgentBatch, From 61c3f2080adace4a51ccaf24188d3a3baf2afe7c Mon Sep 17 00:00:00 2001 From: sven1977 Date: Tue, 3 Sep 2024 11:23:18 +0200 Subject: [PATCH 06/20] wip Signed-off-by: sven1977 --- rllib/algorithms/appo/appo.py | 5 +++- rllib/algorithms/impala/impala_learner.py | 27 +--------------------- rllib/algorithms/ppo/ppo.py | 9 ++++---- rllib/core/learner/learner.py | 6 ++--- rllib/core/learner/learner_group.py | 6 ++--- rllib/tuned_examples/appo/cartpole_appo.py | 14 ++--------- rllib/tuned_examples/ppo/pendulum_ppo.py | 5 ---- 7 files changed, 18 insertions(+), 54 deletions(-) diff --git a/rllib/algorithms/appo/appo.py b/rllib/algorithms/appo/appo.py index d2db78febbce..ad21a9780dc3 100644 --- a/rllib/algorithms/appo/appo.py +++ b/rllib/algorithms/appo/appo.py @@ -98,7 +98,10 @@ def __init__(self, algo_class=None): self.use_kl_loss = False self.kl_coeff = 1.0 self.kl_target = 0.01 - #self.shuffle_batch_per_epoch = True + # TODO (sven): Activate once v-trace sequences in non-RNN batch are solved. + # If we switch this on right now, the shuffling would destroy the rollout + # sequences (non-zero-padded!) needed in the batch for v-trace. + # self.shuffle_batch_per_epoch = True # Override some of IMPALAConfig's default values with APPO-specific values. self.num_env_runners = 2 diff --git a/rllib/algorithms/impala/impala_learner.py b/rllib/algorithms/impala/impala_learner.py index a6caea1b0fe7..6c40c79af17f 100644 --- a/rllib/algorithms/impala/impala_learner.py +++ b/rllib/algorithms/impala/impala_learner.py @@ -11,8 +11,6 @@ from ray.rllib.algorithms.impala.impala import LEARNER_RESULTS_CURR_ENTROPY_COEFF_KEY from ray.rllib.core.columns import Columns from ray.rllib.core.learner.learner import Learner -from ray.rllib.connectors.common import AddStatesFromEpisodesToBatch -from ray.rllib.connectors.connector_v2 import ConnectorV2 from ray.rllib.connectors.learner import AddOneTsToEpisodesAndTruncate from ray.rllib.policy.sample_batch import MultiAgentBatch, SampleBatch from ray.rllib.utils.annotations import ( @@ -70,10 +68,6 @@ def build(self) -> None: and self.config.add_default_connectors_to_learner_pipeline ): self._learner_connector.prepend(AddOneTsToEpisodesAndTruncate()) - self._learner_connector.insert_after( - AddStatesFromEpisodesToBatch, - AddVTraceSeqLensNoRNN, - ) # Create and start the GPU-loader thread. It picks up train-ready batches from # the "GPU-loader queue" and loads them to the GPU, then places the GPU batches @@ -118,6 +112,7 @@ def update_from_episodes( # algos that actually need (and know how) to do minibatching. minibatch_size: Optional[int] = None, num_epochs: int = 1, + shuffle_batch_per_epoch: bool = False, num_total_minibatches: int = 0, reduce_fn=None, # Deprecated args. 
**kwargs, @@ -296,23 +291,3 @@ def step(self): self._out_queue.put(copy.deepcopy(results)) self.metrics.log_value(QUEUE_SIZE_RESULTS_QUEUE, self._out_queue.qsize()) - - -class AddVTraceSeqLensNoRNN(ConnectorV2): - def __init__( - self, - input_observation_space=None, - input_action_space=None, - *, - rollout_fragment_length: int, - **kwargs, - ): - super().__init__(input_observation_space, input_action_space, **kwargs) - self._rollout_fragment_length = rollout_fragment_length - - @override(ConnectorV2) - def __call__(self, *, rl_module, batch, episodes): - if Columns.SEQ_LENS not in batch: - pass - TODO # Continue implementing here - return batch diff --git a/rllib/algorithms/ppo/ppo.py b/rllib/algorithms/ppo/ppo.py index 40efe71a5b9f..558983875df6 100644 --- a/rllib/algorithms/ppo/ppo.py +++ b/rllib/algorithms/ppo/ppo.py @@ -130,17 +130,17 @@ def __init__(self, algo_class=None): self.lr = 5e-5 self.rollout_fragment_length = "auto" self.train_batch_size = 4000 - self.shuffle_batch_per_epoch = True # PPO specific settings: self.use_critic = True self.use_gae = True + self.num_epochs = 30 + self.minibatch_size = 128 + self.shuffle_batch_per_epoch = True self.lambda_ = 1.0 self.use_kl_loss = True self.kl_coeff = 0.2 self.kl_target = 0.01 - self.minibatch_size = 128 - self.num_epochs = 30 self.vf_loss_coeff = 1.0 self.entropy_coeff = 0.0 self.entropy_coeff_schedule = None @@ -467,8 +467,9 @@ def _training_step_new_api_stack(self) -> ResultDict: self.metrics.peek(NUM_ENV_STEPS_SAMPLED_LIFETIME) ), }, - minibatch_size=self.config.minibatch_size, num_epochs=self.config.num_epochs, + minibatch_size=self.config.minibatch_size, + shuffle_batch_per_epoch=self.config.shuffle_batch_per_epoch, ) self.metrics.merge_and_log_n_dicts(learner_results, key=LEARNER_RESULTS) self.metrics.log_dict( diff --git a/rllib/core/learner/learner.py b/rllib/core/learner/learner.py index 8e8247741ebd..c9e6aed75637 100644 --- a/rllib/core/learner/learner.py +++ b/rllib/core/learner/learner.py @@ -927,7 +927,7 @@ def update_from_batch( timesteps: Optional[Dict[str, Any]] = None, num_epochs: int = 1, minibatch_size: Optional[int] = None, - shuffle_batch_per_epoch: bool = True, + shuffle_batch_per_epoch: bool = False, # Deprecated args. num_iters=DEPRECATED_VALUE, ) -> ResultDict: @@ -986,7 +986,7 @@ def update_from_episodes( num_epochs: int = 1, minibatch_size: Optional[int] = None, num_total_minibatches: int = 0, - shuffle_batch_per_epoch: bool = True, + shuffle_batch_per_epoch: bool = False, # Deprecated args. num_iters=DEPRECATED_VALUE, ) -> ResultDict: @@ -1245,7 +1245,7 @@ def _update_from_batch_or_episodes( # algos that actually need (and know how) to do minibatching. minibatch_size: Optional[int] = None, num_epochs: int = 1, - shuffle_batch_per_epoch: bool = True, + shuffle_batch_per_epoch: bool = False, num_total_minibatches: int = 0, ) -> Union[Dict[str, Any], List[Dict[str, Any]]]: diff --git a/rllib/core/learner/learner_group.py b/rllib/core/learner/learner_group.py index 48bd5628b6ef..07492a8b5611 100644 --- a/rllib/core/learner/learner_group.py +++ b/rllib/core/learner/learner_group.py @@ -222,7 +222,7 @@ def update_from_batch( async_update: bool = False, return_state: bool = False, num_epochs: int = 1, - shuffle_batch_per_epoch: bool = True, + shuffle_batch_per_epoch: bool = False, minibatch_size: Optional[int] = None, # User kwargs. 
**kwargs, @@ -286,7 +286,7 @@ def update_from_episodes( return_state: bool = False, num_epochs: int = 1, minibatch_size: Optional[int] = None, - shuffle_batch_per_epoch: bool = True, + shuffle_batch_per_epoch: bool = False, # User kwargs. **kwargs, ) -> Union[Dict[str, Any], List[Dict[str, Any]], List[List[Dict[str, Any]]]]: @@ -353,7 +353,7 @@ def _update( return_state: bool = False, num_epochs: int = 1, minibatch_size: Optional[int] = None, - shuffle_batch_per_epoch: bool = True, + shuffle_batch_per_epoch: bool = False, # Deprecated args. num_iters=DEPRECATED_VALUE, **kwargs, diff --git a/rllib/tuned_examples/appo/cartpole_appo.py b/rllib/tuned_examples/appo/cartpole_appo.py index e8ffd6cff4f9..6a2ccf143464 100644 --- a/rllib/tuned_examples/appo/cartpole_appo.py +++ b/rllib/tuned_examples/appo/cartpole_appo.py @@ -1,9 +1,4 @@ from ray.rllib.algorithms.appo import APPOConfig -from ray.rllib.utils.metrics import ( - ENV_RUNNER_RESULTS, - EPISODE_RETURN_MEAN, - NUM_ENV_STEPS_SAMPLED_LIFETIME, -) from ray.rllib.utils.test_utils import add_rllib_example_script_args parser = add_rllib_example_script_args( @@ -25,17 +20,12 @@ ) .environment("CartPole-v1") .training( - train_batch_size_per_learner=1000, vf_loss_coeff=0.05, - entropy_coeff=0.01, - num_epochs=2, - lr=0.00075, - minibatch_size=250, + entropy_coeff=0.0, ) .rl_module( model_config_dict={ - "fcnet_hiddens": [32], - # "vf_share_layers": True, + "vf_share_layers": True, "uses_new_env_runners": True, }, ) diff --git a/rllib/tuned_examples/ppo/pendulum_ppo.py b/rllib/tuned_examples/ppo/pendulum_ppo.py index aa0c5d1027b5..5df6e3e78855 100644 --- a/rllib/tuned_examples/ppo/pendulum_ppo.py +++ b/rllib/tuned_examples/ppo/pendulum_ppo.py @@ -9,11 +9,6 @@ config = ( PPOConfig() - # Enable new API stack and use EnvRunner. - .api_stack( - enable_rl_module_and_learner=True, - enable_env_runner_and_connector_v2=True, - ) .env_runners( num_env_runners=2, num_envs_per_env_runner=10, From a20f44c937c7631f93874b152167cbc3078431df Mon Sep 17 00:00:00 2001 From: sven1977 Date: Tue, 3 Sep 2024 13:02:30 +0200 Subject: [PATCH 07/20] fix Signed-off-by: sven1977 --- rllib/policy/torch_policy.py | 10 +++++++--- rllib/policy/torch_policy_v2.py | 11 +++++++---- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/rllib/policy/torch_policy.py b/rllib/policy/torch_policy.py index e4db6d37a5c0..5abd0c9922f8 100644 --- a/rllib/policy/torch_policy.py +++ b/rllib/policy/torch_policy.py @@ -548,9 +548,13 @@ def learn_on_loaded_batch(self, offset: int = 0, buffer_index: int = 0): # Get the correct slice of the already loaded batch to use, # based on offset and batch size. - device_batch_size = self.config.get( - "minibatch_size", self.config["train_batch_size"] - ) // len(self.devices) + device_batch_size = self.config.get("minibatch_size") + if device_batch_size is None: + device_batch_size = self.config.get( + "sgd_minibatch_size", + self.config["train_batch_size"], + ) + device_batch_size //= len(self.devices) # Set Model to train mode. if self.model_gpu_towers: diff --git a/rllib/policy/torch_policy_v2.py b/rllib/policy/torch_policy_v2.py index 649fc19f88e3..a61116fb712c 100644 --- a/rllib/policy/torch_policy_v2.py +++ b/rllib/policy/torch_policy_v2.py @@ -838,10 +838,13 @@ def learn_on_loaded_batch(self, offset: int = 0, buffer_index: int = 0): # Get the correct slice of the already loaded batch to use, # based on offset and batch size. 
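The same lookup order is applied in several policies below: prefer the new `minibatch_size`, fall back to the old `sgd_minibatch_size`, then to `train_batch_size`, and only then divide by the number of devices. A small sketch of that lookup, with a plain dict standing in for the policy config:

def get_device_batch_size(config, num_devices):
    """Resolve the per-device batch size with old/new key fallbacks."""
    batch_size = config.get("minibatch_size")
    if batch_size is None:
        # Old API stack name first, then the full train batch as a last resort.
        batch_size = config.get("sgd_minibatch_size", config["train_batch_size"])
    return batch_size // num_devices

print(get_device_batch_size({"minibatch_size": 128, "train_batch_size": 4000}, 2))   # 64
print(get_device_batch_size({"minibatch_size": None, "sgd_minibatch_size": 256,
                             "train_batch_size": 4000}, 2))                          # 128
print(get_device_batch_size({"minibatch_size": None, "train_batch_size": 4000}, 2))  # 2000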
- device_batch_size = self.config.get( - "minibatch_size", - self.config.get("sgd_minibatch_size", self.config["train_batch_size"]), - ) // len(self.devices) + device_batch_size = self.config.get("minibatch_size") + if device_batch_size is None: + device_batch_size = self.config.get( + "sgd_minibatch_size", + self.config["train_batch_size"], + ) + device_batch_size //= len(self.devices) # Set Model to train mode. if self.model_gpu_towers: From b966d998dd6f0e80f4c019a3c6ded43bf4f8a7b1 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Tue, 3 Sep 2024 14:20:22 +0200 Subject: [PATCH 08/20] fix Signed-off-by: sven1977 --- rllib/algorithms/algorithm_config.py | 12 ++++++------ .../common/add_states_from_episodes_to_batch.py | 9 +++++++++ rllib/core/learner/learner.py | 14 +++++++++----- rllib/core/learner/learner_group.py | 2 +- rllib/utils/minibatch_utils.py | 2 +- 5 files changed, 26 insertions(+), 13 deletions(-) diff --git a/rllib/algorithms/algorithm_config.py b/rllib/algorithms/algorithm_config.py index 233469a5c429..6eb78b4a6532 100644 --- a/rllib/algorithms/algorithm_config.py +++ b/rllib/algorithms/algorithm_config.py @@ -386,8 +386,8 @@ def __init__(self, algo_class: Optional[type] = None): # These setting have been adopted from the original PPO batch settings: # num_sgd_iter, minibatch_size, and shuffle_sequences. self.num_epochs = 1 - self.shuffle_batch_per_epoch = False self.minibatch_size = None + self.shuffle_batch_per_epoch = False # TODO (sven): Unsolved problem with RLModules sometimes requiring settings from # the main AlgorithmConfig. We should not require the user to provide those @@ -2053,8 +2053,8 @@ def training( train_batch_size: Optional[int] = NotProvided, train_batch_size_per_learner: Optional[int] = NotProvided, num_epochs: Optional[int] = NotProvided, - shuffle_batch_per_epoch: Optional[bool] = NotProvided, minibatch_size: Optional[int] = NotProvided, + shuffle_batch_per_epoch: Optional[bool] = NotProvided, model: Optional[dict] = NotProvided, optimizer: Optional[dict] = NotProvided, max_requests_in_flight_per_sampler_worker: Optional[int] = NotProvided, @@ -2116,12 +2116,12 @@ def training( num_epochs: The number of complete passes over the entire train batch (per Learner). Each pass might be further split into n minibatches (if `minibatch_size` provided). + minibatch_size: The size of minibatches to use to further split the train + batch into. shuffle_batch_per_epoch: Whether to shuffle the train batch once per epoch. If the train batch has a time rank (axis=1), shuffling will only take place along the batch axis to not disturb any intact (episode) trajectories. - minibatch_size: The size of minibatches to use to further split the train - batch into. model: Arguments passed into the policy model. See models/catalog.py for a full list of the available model options. TODO: Provide ModelConfig objects instead of dicts. 
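With these settings promoted to `AlgorithmConfig`, a PPO setup using the renamed arguments would look roughly like the following; the concrete numbers are illustrative only, and the comments name the old keywords being replaced:

from ray.rllib.algorithms.ppo import PPOConfig

config = (
    PPOConfig()
    .environment("CartPole-v1")
    .training(
        train_batch_size=4000,
        minibatch_size=128,            # was: sgd_minibatch_size
        num_epochs=30,                 # was: num_sgd_iter
        shuffle_batch_per_epoch=True,  # was: shuffle_sequences
        lr=0.0003,
    )
)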
@@ -2187,10 +2187,10 @@ def training( self.train_batch_size = train_batch_size if num_epochs is not NotProvided: self.num_epochs = num_epochs - if shuffle_batch_per_epoch is not NotProvided: - self.shuffle_batch_per_epoch = shuffle_batch_per_epoch if minibatch_size is not NotProvided: self.minibatch_size = minibatch_size + if shuffle_batch_per_epoch is not NotProvided: + self.shuffle_batch_per_epoch = shuffle_batch_per_epoch if model is not NotProvided: self.model.update(model) diff --git a/rllib/connectors/common/add_states_from_episodes_to_batch.py b/rllib/connectors/common/add_states_from_episodes_to_batch.py index 2c62466d84ab..e4e5bfa2641a 100644 --- a/rllib/connectors/common/add_states_from_episodes_to_batch.py +++ b/rllib/connectors/common/add_states_from_episodes_to_batch.py @@ -266,6 +266,15 @@ def __call__( item_list, max_seq_len=self._get_max_seq_len(rl_module, module_id=mid), ) + # TODO (sven): Remove this hint/hack once we are not relying on + # SampleBatch anymore (which has to set its property + # zero_padded=True when shuffling). + shared_data[ + ( + "_zero_padded_for_mid=" + f"{mid if mid is not None else DEFAULT_MODULE_ID}" + ) + ] = True for sa_episode in self.single_agent_episode_iterator( episodes, diff --git a/rllib/core/learner/learner.py b/rllib/core/learner/learner.py index c9e6aed75637..db3bfaa1eab8 100644 --- a/rllib/core/learner/learner.py +++ b/rllib/core/learner/learner.py @@ -985,8 +985,8 @@ def update_from_episodes( timesteps: Optional[Dict[str, Any]] = None, num_epochs: int = 1, minibatch_size: Optional[int] = None, - num_total_minibatches: int = 0, shuffle_batch_per_epoch: bool = False, + num_total_minibatches: int = 0, # Deprecated args. num_iters=DEPRECATED_VALUE, ) -> ResultDict: @@ -1040,8 +1040,8 @@ def update_from_episodes( return self._update_from_batch_or_episodes( episodes=episodes, timesteps=timesteps, - minibatch_size=minibatch_size, num_epochs=num_epochs, + minibatch_size=minibatch_size, shuffle_batch_per_epoch=shuffle_batch_per_epoch, num_total_minibatches=num_total_minibatches, ) @@ -1243,8 +1243,8 @@ def _update_from_batch_or_episodes( timesteps: Optional[Dict[str, Any]] = None, # TODO (sven): Deprecate these in favor of config attributes for only those # algos that actually need (and know how) to do minibatching. - minibatch_size: Optional[int] = None, num_epochs: int = 1, + minibatch_size: Optional[int] = None, shuffle_batch_per_epoch: bool = False, num_total_minibatches: int = 0, ) -> Union[Dict[str, Any], List[Dict[str, Any]]]: @@ -1280,7 +1280,11 @@ def _update_from_batch_or_episodes( # TODO (sven): Try to not require MultiAgentBatch anymore. 
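The zero-padding information travels from the connector to the Learner through the `shared_data` dict under a per-module key. A toy sketch of that handshake with a plain dict; the module ids are made up and `DEFAULT_MODULE_ID` is a stand-in for RLlib's constant:

DEFAULT_MODULE_ID = "default_policy"  # stand-in, for illustration only

shared_data = {}

# Connector side: record that a module's batch was zero-padded.
for mid in [None, "pol1"]:
    key = f"_zero_padded_for_mid={mid if mid is not None else DEFAULT_MODULE_ID}"
    shared_data[key] = True

# Learner side: decide per module whether to mark its SampleBatch as padded.
for module_id in ["default_policy", "pol1", "pol2"]:
    zero_padded = shared_data.get(f"_zero_padded_for_mid={module_id}", False)
    print(module_id, zero_padded)  # pol2 -> False (was never padded)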
batch = MultiAgentBatch( { - module_id: SampleBatch(module_data) + module_id: ( + SampleBatch(module_data, _zero_padded=True) + if shared_data.get(f"_zero_padded_for_mid={module_id}") + else SampleBatch(module_data) + ) for module_id, module_data in batch.items() }, env_steps=sum(len(e) for e in episodes), @@ -1340,8 +1344,8 @@ def _update_from_batch_or_episodes( for tensor_minibatch in batch_iter( batch, - minibatch_size=minibatch_size, num_epochs=num_epochs, + minibatch_size=minibatch_size, shuffle_batch_per_epoch=shuffle_batch_per_epoch and (num_epochs > 1), num_total_minibatches=num_total_minibatches, ): diff --git a/rllib/core/learner/learner_group.py b/rllib/core/learner/learner_group.py index 07492a8b5611..eb8d2a3cb05d 100644 --- a/rllib/core/learner/learner_group.py +++ b/rllib/core/learner/learner_group.py @@ -222,8 +222,8 @@ def update_from_batch( async_update: bool = False, return_state: bool = False, num_epochs: int = 1, - shuffle_batch_per_epoch: bool = False, minibatch_size: Optional[int] = None, + shuffle_batch_per_epoch: bool = False, # User kwargs. **kwargs, ) -> Union[Dict[str, Any], List[Dict[str, Any]], List[List[Dict[str, Any]]]]: diff --git a/rllib/utils/minibatch_utils.py b/rllib/utils/minibatch_utils.py index f07b7f23e64f..e27b5a7782ba 100644 --- a/rllib/utils/minibatch_utils.py +++ b/rllib/utils/minibatch_utils.py @@ -57,8 +57,8 @@ def __init__( batch: MultiAgentBatch, *, num_epochs: int = 1, - shuffle_batch_per_epoch: bool = True, minibatch_size: int, + shuffle_batch_per_epoch: bool = True, num_total_minibatches: int = 0, _uses_new_env_runners: bool = False, ) -> None: From 42535d49eba34217a7cd846a4c8e529e601fddba Mon Sep 17 00:00:00 2001 From: sven1977 Date: Wed, 4 Sep 2024 10:47:14 +0200 Subject: [PATCH 09/20] fix Signed-off-by: sven1977 --- rllib/execution/train_ops.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/rllib/execution/train_ops.py b/rllib/execution/train_ops.py index bf930a00f5e2..5c207ced3a27 100644 --- a/rllib/execution/train_ops.py +++ b/rllib/execution/train_ops.py @@ -46,7 +46,9 @@ def train_one_step(algorithm, train_batch, policies_to_train=None) -> Dict: workers = algorithm.env_runner_group local_worker = workers.local_env_runner num_sgd_iter = config.get("num_epochs", config.get("num_sgd_iter", 1)) - minibatch_size = config.get("minibatch_size", config.get("sgd_minibatch_size", 0)) + minibatch_size = config.get("minibatch_size") + if minibatch_size is None: + minibatch_size = config.get("sgd_minibatch_size", 0) learn_timer = algorithm._timers[LEARN_ON_BATCH_TIMER] with learn_timer: From 292c71fda65e1ac970c4574dc3717288b1355b75 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Wed, 4 Sep 2024 11:44:40 +0200 Subject: [PATCH 10/20] fix Signed-off-by: sven1977 --- rllib/execution/train_ops.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/rllib/execution/train_ops.py b/rllib/execution/train_ops.py index 5c207ced3a27..2b2b76bc671e 100644 --- a/rllib/execution/train_ops.py +++ b/rllib/execution/train_ops.py @@ -117,7 +117,9 @@ def multi_gpu_train_one_step(algorithm, train_batch) -> Dict: workers = algorithm.env_runner_group local_worker = workers.local_env_runner num_sgd_iter = config.get("num_epochs", config.get("num_sgd_iter", 1)) - minibatch_size = config.get("minibatch_size", config["train_batch_size"]) + minibatch_size = config.get("minibatch_size") + if minibatch_size is None: + minibatch_size = config["train_batch_size"] # Determine the number of devices (GPUs or 1 CPU) we use. 
num_devices = int(math.ceil(config["num_gpus"] or 1)) From 1f748f10c36ef6470bb611f4754c9277d719103d Mon Sep 17 00:00:00 2001 From: sven1977 Date: Wed, 4 Sep 2024 12:51:21 +0200 Subject: [PATCH 11/20] fix Signed-off-by: sven1977 --- rllib/utils/tests/test_minibatch_utils.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/rllib/utils/tests/test_minibatch_utils.py b/rllib/utils/tests/test_minibatch_utils.py index 879e8b522a1b..0d6b53d060be 100644 --- a/rllib/utils/tests/test_minibatch_utils.py +++ b/rllib/utils/tests/test_minibatch_utils.py @@ -72,7 +72,8 @@ def test_minibatch_cyclic_iterator(self): ] ), "seq_lens": seq_lens, - } + }, + _zero_padded=padding, ) for i in range(len(agent_steps)) } @@ -86,7 +87,10 @@ def test_minibatch_cyclic_iterator(self): mb = MultiAgentBatch(sample_batches, num_env_steps) batch_iter = MiniBatchCyclicIterator( - mb, minibatch_size=minibatch_size, num_epochs=num_epochs + mb, + minibatch_size=minibatch_size, + num_epochs=num_epochs, + shuffle_batch_per_epoch=False, ) print(config) iteration_counter = 0 From c13647a6ec041a6dd6a10fd017295128c9682365 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Wed, 4 Sep 2024 13:16:03 +0200 Subject: [PATCH 12/20] fix Signed-off-by: sven1977 --- rllib/algorithms/appo/appo.py | 11 ++++------- rllib/algorithms/cql/cql.py | 8 ++++---- rllib/algorithms/marwil/marwil.py | 16 ++++++++-------- rllib/core/learner/learner.py | 9 +++++++-- rllib/core/learner/learner_group.py | 14 +++----------- 5 files changed, 26 insertions(+), 32 deletions(-) diff --git a/rllib/algorithms/appo/appo.py b/rllib/algorithms/appo/appo.py index ad21a9780dc3..22d2edfd2860 100644 --- a/rllib/algorithms/appo/appo.py +++ b/rllib/algorithms/appo/appo.py @@ -188,13 +188,10 @@ def training( target_network_update_freq: The frequency to update the target policy and tune the kl loss coefficients that are used during training. After setting this parameter, the algorithm waits for at least - `target_network_update_freq * minibatch_size * num_epochs` number of - samples to be trained on by the learner group before updating the target - networks and tuned the kl loss coefficients that are used during - training. - NOTE: This parameter is only applicable when using the Learner API - (enable_rl_module_and_learner=True). - + `target_network_update_freq` number of environment samples to be trained + on before updating the target networks and tune the kl loss + coefficients. NOTE: This parameter is only applicable when using the + Learner API (enable_rl_module_and_learner=True). Returns: This updated AlgorithmConfig object. diff --git a/rllib/algorithms/cql/cql.py b/rllib/algorithms/cql/cql.py index cf6cb5f7e041..79e94ccf75a4 100644 --- a/rllib/algorithms/cql/cql.py +++ b/rllib/algorithms/cql/cql.py @@ -306,7 +306,7 @@ def _training_step_new_api_stack(self) -> ResultDict: # Sampling from offline data. with self.metrics.log_time((TIMERS, OFFLINE_SAMPLING_TIMER)): # Return an iterator in case we are using remote learners. - batch = self.offline_data.sample( + batch_or_iterator = self.offline_data.sample( num_samples=self.config.train_batch_size_per_learner, num_shards=self.config.num_learners, return_iterator=self.config.num_learners > 1, @@ -315,9 +315,9 @@ def _training_step_new_api_stack(self) -> ResultDict: # Updating the policy. with self.metrics.log_time((TIMERS, LEARNER_UPDATE_TIMER)): # TODO (simon, sven): Check, if we should execute directly s.th. like - # update_from_iterator. 
- learner_results = self.learner_group.update_from_batch( - batch, + # `LearnerGroup.update_from_iterator()`. + learner_results = self.learner_group._update( + batch=batch_or_iterator, minibatch_size=self.config.train_batch_size_per_learner, num_iters=self.config.dataset_num_iters_per_learner, ) diff --git a/rllib/algorithms/marwil/marwil.py b/rllib/algorithms/marwil/marwil.py index 7dbe8c85566f..d73e074fdff9 100644 --- a/rllib/algorithms/marwil/marwil.py +++ b/rllib/algorithms/marwil/marwil.py @@ -380,12 +380,12 @@ class (multi-/single-learner setup) and evaluation on """ # Implement logic using RLModule and Learner API. # TODO (simon): Take care of sampler metrics: right - # now all rewards are `nan`, which possibly confuses - # the user that sth. is not right, although it is as - # we do not step the env. + # now all rewards are `nan`, which possibly confuses + # the user that sth. is not right, although it is as + # we do not step the env. with self.metrics.log_time((TIMERS, OFFLINE_SAMPLING_TIMER)): # Sampling from offline data. - batch = self.offline_data.sample( + batch_or_iterator = self.offline_data.sample( num_samples=self.config.train_batch_size_per_learner, num_shards=self.config.num_learners, return_iterator=self.config.num_learners > 1, @@ -394,11 +394,11 @@ class (multi-/single-learner setup) and evaluation on with self.metrics.log_time((TIMERS, LEARNER_UPDATE_TIMER)): # Updating the policy. # TODO (simon, sven): Check, if we should execute directly s.th. like - # update_from_iterator. - learner_results = self.learner_group.update_from_batch( - batch, + # `LearnerGroup.update_from_iterator()`. + learner_results = self.learner_group._update( + batch=batch_or_iterator, minibatch_size=self.config.train_batch_size_per_learner, - num_epochs=self.config.dataset_num_iters_per_learner, + num_iters=self.config.dataset_num_iters_per_learner, ) # Log training results. diff --git a/rllib/core/learner/learner.py b/rllib/core/learner/learner.py index db3bfaa1eab8..7e7ca16dbfa2 100644 --- a/rllib/core/learner/learner.py +++ b/rllib/core/learner/learner.py @@ -944,8 +944,7 @@ def update_from_batch( # TODO (sven): Make this a more formal structure with its own type. num_epochs: The number of complete passes over the entire train batch. Each pass might be further split into n minibatches (if `minibatch_size` - provided). The train batch is generated from the given `episodes` - through the Learner connector pipeline. + provided). minibatch_size: The size of minibatches to use to further split the train `batch` into sub-batches. The `batch` is then iterated over n times where n is `len(batch) // minibatch_size`. @@ -1055,6 +1054,12 @@ def update_from_iterator( num_iters: int = None, **kwargs, ): + if "num_epochs" in kwargs: + raise ValueError( + "`num_epochs` arg NOT supported by Learner.update_from_iterator! Use " + "`num_iters` instead." 
+ ) + self._check_is_built() # minibatch_size = minibatch_size or 32 diff --git a/rllib/core/learner/learner_group.py b/rllib/core/learner/learner_group.py index eb8d2a3cb05d..fe4aa9cfd09c 100644 --- a/rllib/core/learner/learner_group.py +++ b/rllib/core/learner/learner_group.py @@ -35,11 +35,7 @@ ) from ray.rllib.utils.annotations import override from ray.rllib.utils.checkpoints import Checkpointable -from ray.rllib.utils.deprecation import ( - Deprecated, - DEPRECATED_VALUE, - deprecation_warning, -) +from ray.rllib.utils.deprecation import Deprecated from ray.rllib.utils.metrics.metrics_logger import MetricsLogger from ray.rllib.utils.minibatch_utils import ( ShardBatchIterator, @@ -352,16 +348,12 @@ def _update( async_update: bool = False, return_state: bool = False, num_epochs: int = 1, + num_iters: int = 1, minibatch_size: Optional[int] = None, shuffle_batch_per_epoch: bool = False, - # Deprecated args. - num_iters=DEPRECATED_VALUE, **kwargs, ) -> Union[Dict[str, Any], List[Dict[str, Any]], List[List[Dict[str, Any]]]]: - if num_iters != DEPRECATED_VALUE: - deprecation_warning(old="num_iters", new="num_epochs", error=True) - # Define function to be called on all Learner actors (or the local learner). def _learner_update( _learner: Learner, @@ -381,7 +373,7 @@ def _learner_update( iterator=_batch_shard, timesteps=_timesteps, minibatch_size=minibatch_size, - num_epochs=num_epochs, + num_iters=num_iters, **_kwargs, ) elif _batch_shard is not None: From cd3869512063e34c848801ea4a2b3cfad6173a9f Mon Sep 17 00:00:00 2001 From: sven1977 Date: Wed, 4 Sep 2024 14:26:32 +0200 Subject: [PATCH 13/20] fixes Signed-off-by: sven1977 --- rllib/algorithms/algorithm_config.py | 10 ++++++++++ rllib/algorithms/tests/test_algorithm_config.py | 2 +- ...te_modelv2_to_new_api_stack_by_policy_checkpoint.py | 2 +- 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/rllib/algorithms/algorithm_config.py b/rllib/algorithms/algorithm_config.py index cebdd95e0006..1044757b7290 100644 --- a/rllib/algorithms/algorithm_config.py +++ b/rllib/algorithms/algorithm_config.py @@ -2083,6 +2083,8 @@ def training( ] = NotProvided, add_default_connectors_to_learner_pipeline: Optional[bool] = NotProvided, learner_config_dict: Optional[Dict[str, Any]] = NotProvided, + # Deprecated args. + num_sgd_iter=DEPRECATED_VALUE, ) -> "AlgorithmConfig": """Sets the training related configuration. @@ -2187,6 +2189,14 @@ def training( Returns: This updated AlgorithmConfig object. 
""" + if num_sgd_iter is not NotProvided: + deprecation_warning( + old="config.training(num_sgd_iter=..)", + new="config.training(num_epochs=..)", + error=False, + ) + num_epochs = num_sgd_iter + if gamma is not NotProvided: self.gamma = gamma if lr is not NotProvided: diff --git a/rllib/algorithms/tests/test_algorithm_config.py b/rllib/algorithms/tests/test_algorithm_config.py index 03ec44a9aad9..9f81bd7abd9d 100644 --- a/rllib/algorithms/tests/test_algorithm_config.py +++ b/rllib/algorithms/tests/test_algorithm_config.py @@ -30,7 +30,7 @@ def test_running_specific_algo_with_generic_config(self): config = ( AlgorithmConfig(algo_class=PPO) .environment("CartPole-v0") - .training(lr=0.12345, train_batch_size=3000) + .training(lr=0.12345, train_batch_size=3000, minibatch_size=300) ) algo = config.build() self.assertTrue(algo.config.lr == 0.12345) diff --git a/rllib/examples/rl_modules/migrate_modelv2_to_new_api_stack_by_policy_checkpoint.py b/rllib/examples/rl_modules/migrate_modelv2_to_new_api_stack_by_policy_checkpoint.py index 5de20eee0f52..d67195f86a64 100644 --- a/rllib/examples/rl_modules/migrate_modelv2_to_new_api_stack_by_policy_checkpoint.py +++ b/rllib/examples/rl_modules/migrate_modelv2_to_new_api_stack_by_policy_checkpoint.py @@ -26,7 +26,7 @@ .environment("CartPole-v1") .training( lr=0.0003, - num_sgd_iter=6, + num_epochs=6, vf_loss_coeff=0.01, ) ) From 4f36d7af366e4545ffddc81abfeac2dc63a6a69b Mon Sep 17 00:00:00 2001 From: sven1977 Date: Wed, 4 Sep 2024 14:35:37 +0200 Subject: [PATCH 14/20] fix Signed-off-by: sven1977 --- rllib/policy/dynamic_tf_policy.py | 9 +++++---- rllib/policy/dynamic_tf_policy_v2.py | 9 +++++---- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/rllib/policy/dynamic_tf_policy.py b/rllib/policy/dynamic_tf_policy.py index efd7b4024131..ac40205de94a 100644 --- a/rllib/policy/dynamic_tf_policy.py +++ b/rllib/policy/dynamic_tf_policy.py @@ -617,10 +617,11 @@ def learn_on_loaded_batch(self, offset: int = 0, buffer_index: int = 0): ) # Get the correct slice of the already loaded batch to use, # based on offset and batch size. - batch_size = self.config.get( - "minibatch_size", - self.config.get("sgd_minibatch_size", self.config["train_batch_size"]), - ) + batch_size = self.config.get("minibatch_size") + if batch_size is None: + batch_size = self.config.get( + "sgd_minibatch_size", self.config["train_batch_size"] + ) if batch_size >= len(self._loaded_single_cpu_batch): sliced_batch = self._loaded_single_cpu_batch else: diff --git a/rllib/policy/dynamic_tf_policy_v2.py b/rllib/policy/dynamic_tf_policy_v2.py index f11cba1ee57d..e2ad3d6da0ab 100644 --- a/rllib/policy/dynamic_tf_policy_v2.py +++ b/rllib/policy/dynamic_tf_policy_v2.py @@ -1003,10 +1003,11 @@ def learn_on_loaded_batch(self, offset: int = 0, buffer_index: int = 0): ) # Get the correct slice of the already loaded batch to use, # based on offset and batch size. 
- batch_size = self.config.get( - "minibatch_size", - self.config.get("sgd_minibatch_size", self.config["train_batch_size"]), - ) + batch_size = self.config.get("minibatch_size") + if batch_size is None: + batch_size = self.config.get( + "sgd_minibatch_size", self.config["train_batch_size"] + ) if batch_size >= len(self._loaded_single_cpu_batch): sliced_batch = self._loaded_single_cpu_batch From 927ba3d1b81ec56a9214300bb1b072d0aa130300 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Wed, 4 Sep 2024 16:02:45 +0200 Subject: [PATCH 15/20] fix Signed-off-by: sven1977 --- rllib/algorithms/algorithm_config.py | 2 +- rllib/tests/test_lstm.py | 71 ---------------------------- 2 files changed, 1 insertion(+), 72 deletions(-) diff --git a/rllib/algorithms/algorithm_config.py b/rllib/algorithms/algorithm_config.py index 1044757b7290..53377b1637d2 100644 --- a/rllib/algorithms/algorithm_config.py +++ b/rllib/algorithms/algorithm_config.py @@ -2189,7 +2189,7 @@ def training( Returns: This updated AlgorithmConfig object. """ - if num_sgd_iter is not NotProvided: + if num_sgd_iter != DEPRECATED_VALUE: deprecation_warning( old="config.training(num_sgd_iter=..)", new="config.training(num_epochs=..)", diff --git a/rllib/tests/test_lstm.py b/rllib/tests/test_lstm.py index eda9a0c3e440..969683d8ca38 100644 --- a/rllib/tests/test_lstm.py +++ b/rllib/tests/test_lstm.py @@ -173,77 +173,6 @@ def setUp(self) -> None: def tearDown(self) -> None: ray.shutdown() - def test_simple_optimizer_sequencing(self): - ModelCatalog.register_custom_model("rnn", RNNSpyModel) - register_env("counter", lambda _: DebugCounterEnv()) - config = ( - PPOConfig() - .environment("counter") - .framework("tf") - .env_runners(num_env_runners=0, rollout_fragment_length=10) - .training( - train_batch_size=10, - minibatch_size=10, - num_epochs=1, - model={ - "custom_model": "rnn", - "max_seq_len": 4, - "vf_share_layers": True, - }, - ) - ) - # Force-set simple_optimizer (fully deprecated soon). 
- config.simple_optimizer = True - ppo = config.build() - ppo.train() - ppo.train() - ppo.stop() - - batch0 = pickle.loads( - ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_0") - ) - self.assertEqual( - batch0["sequences"].tolist(), - [[[0], [1], [2], [3]], [[4], [5], [6], [7]], [[8], [9], [0], [0]]], - ) - self.assertEqual(batch0[SampleBatch.SEQ_LENS].tolist(), [4, 4, 2]) - self.assertEqual(batch0["state_in"][0][0].tolist(), [0, 0, 0]) - self.assertEqual(batch0["state_in"][1][0].tolist(), [0, 0, 0]) - self.assertGreater(abs(np.sum(batch0["state_in"][0][1])), 0) - self.assertGreater(abs(np.sum(batch0["state_in"][1][1])), 0) - self.assertTrue( - np.allclose( - batch0["state_in"][0].tolist()[1:], batch0["state_out"][0].tolist()[:-1] - ) - ) - self.assertTrue( - np.allclose( - batch0["state_in"][1].tolist()[1:], batch0["state_out"][1].tolist()[:-1] - ) - ) - - batch1 = pickle.loads( - ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_1") - ) - self.assertEqual( - batch1["sequences"].tolist(), - [ - [[10], [11], [12], [13]], - [[14], [0], [0], [0]], - [[0], [1], [2], [3]], - [[4], [0], [0], [0]], - ], - ) - self.assertEqual(batch1[SampleBatch.SEQ_LENS].tolist(), [4, 1, 4, 1]) - self.assertEqual(batch1["state_in"][0][2].tolist(), [0, 0, 0]) - self.assertEqual(batch1["state_in"][1][2].tolist(), [0, 0, 0]) - self.assertGreater(abs(np.sum(batch1["state_in"][0][0])), 0) - self.assertGreater(abs(np.sum(batch1["state_in"][1][0])), 0) - self.assertGreater(abs(np.sum(batch1["state_in"][0][1])), 0) - self.assertGreater(abs(np.sum(batch1["state_in"][1][1])), 0) - self.assertGreater(abs(np.sum(batch1["state_in"][0][3])), 0) - self.assertGreater(abs(np.sum(batch1["state_in"][1][3])), 0) - def test_minibatch_sequencing(self): ModelCatalog.register_custom_model("rnn", RNNSpyModel) register_env("counter", lambda _: DebugCounterEnv()) From 3264f9c7911b40d830db5d82eac20e7e1074cf28 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Thu, 5 Sep 2024 11:45:30 +0200 Subject: [PATCH 16/20] APPO stateless cartpole not learning Signed-off-by: sven1977 --- rllib/policy/sample_batch.py | 7 ++++++- .../appo/multi_agent_stateless_cartpole_appo.py | 2 +- rllib/tuned_examples/appo/stateless_cartpole_appo.py | 2 +- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/rllib/policy/sample_batch.py b/rllib/policy/sample_batch.py index 33a0b5eea25b..36abaa36ad76 100644 --- a/rllib/policy/sample_batch.py +++ b/rllib/policy/sample_batch.py @@ -733,7 +733,12 @@ def _batch_slice(self, slice_: slice) -> "SampleBatch": infos = self.pop(SampleBatch.INFOS, None) data = tree.map_structure(lambda value: value[start:stop], self) if infos is not None: - data[SampleBatch.INFOS] = infos[start:stop] + # Slice infos according to SEQ_LENS. + info_slice_start = int(sum(self[SampleBatch.SEQ_LENS][:start])) + info_slice_stop = int(sum(self[SampleBatch.SEQ_LENS][start:stop])) + data[SampleBatch.INFOS] = infos[info_slice_start:info_slice_stop] + # Put infos back into `self`. 
+ self[Columns.INFOS] = infos return SampleBatch( data, diff --git a/rllib/tuned_examples/appo/multi_agent_stateless_cartpole_appo.py b/rllib/tuned_examples/appo/multi_agent_stateless_cartpole_appo.py index 4fb553a7b4fc..117ddeb32bd7 100644 --- a/rllib/tuned_examples/appo/multi_agent_stateless_cartpole_appo.py +++ b/rllib/tuned_examples/appo/multi_agent_stateless_cartpole_appo.py @@ -36,7 +36,7 @@ .training( train_batch_size_per_learner=600, lr=0.0005 * ((args.num_gpus or 1) ** 0.5), - num_epochs=6, + num_epochs=1, vf_loss_coeff=0.05, grad_clip=20.0, ) diff --git a/rllib/tuned_examples/appo/stateless_cartpole_appo.py b/rllib/tuned_examples/appo/stateless_cartpole_appo.py index 3763d91c9109..99421ee58bf0 100644 --- a/rllib/tuned_examples/appo/stateless_cartpole_appo.py +++ b/rllib/tuned_examples/appo/stateless_cartpole_appo.py @@ -29,7 +29,7 @@ ) .training( lr=0.0005 * ((args.num_gpus or 1) ** 0.5), - num_epochs=6, + num_epochs=1, vf_loss_coeff=0.05, grad_clip=20.0, ) From 804bfc24ae9712da1c96e8949cffbcb2f973c5d7 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Thu, 5 Sep 2024 13:20:20 +0200 Subject: [PATCH 17/20] wip Signed-off-by: sven1977 --- rllib/tuned_examples/ppo/multi_agent_pendulum_ppo.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/rllib/tuned_examples/ppo/multi_agent_pendulum_ppo.py b/rllib/tuned_examples/ppo/multi_agent_pendulum_ppo.py index 42f0398a97bd..6ef4f2dcbfaf 100644 --- a/rllib/tuned_examples/ppo/multi_agent_pendulum_ppo.py +++ b/rllib/tuned_examples/ppo/multi_agent_pendulum_ppo.py @@ -14,7 +14,7 @@ num_agents=2, ) # Use `parser` to add your own custom command line options to this script -# and (if needed) use their values toset up `config` below. +# and (if needed) use their values to set up `config` below. args = parser.parse_args() register_env("multi_agent_pendulum", lambda cfg: MultiAgentPendulum(config=cfg)) @@ -26,7 +26,6 @@ enable_env_runner_and_connector_v2=True, ) .environment("multi_agent_pendulum", env_config={"num_agents": args.num_agents}) - .env_runners(num_env_runners=4) .training( lr=0.0003, lambda_=0.1, From a79630ac5d5bb3a0a99dd21828c4a0d45b83d130 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Thu, 5 Sep 2024 14:33:49 +0200 Subject: [PATCH 18/20] more ts for pendulum PPO Signed-off-by: sven1977 --- rllib/tuned_examples/ppo/pendulum_ppo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rllib/tuned_examples/ppo/pendulum_ppo.py b/rllib/tuned_examples/ppo/pendulum_ppo.py index 5df6e3e78855..a401ad720867 100644 --- a/rllib/tuned_examples/ppo/pendulum_ppo.py +++ b/rllib/tuned_examples/ppo/pendulum_ppo.py @@ -1,7 +1,7 @@ from ray.rllib.algorithms.ppo import PPOConfig from ray.rllib.utils.test_utils import add_rllib_example_script_args -parser = add_rllib_example_script_args(default_timesteps=400000, default_reward=-300) +parser = add_rllib_example_script_args(default_timesteps=600000, default_reward=-300) parser.set_defaults(enable_new_api_stack=True) # Use `parser` to add your own custom command line options to this script # and (if needed) use their values toset up `config` below. From 7bdab986e866d0df7519b0c605b934c369169099 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Tue, 17 Sep 2024 11:46:12 +0200 Subject: [PATCH 19/20] better PPO Pendulum tuned examples. 
Signed-off-by: sven1977 --- rllib/algorithms/ppo/ppo.py | 2 +- .../add_states_from_episodes_to_batch.py | 6 +++--- .../ppo/multi_agent_pendulum_ppo.py | 13 +++++++++---- rllib/tuned_examples/ppo/pendulum_ppo.py | 18 +++++++++++------- 4 files changed, 24 insertions(+), 15 deletions(-) diff --git a/rllib/algorithms/ppo/ppo.py b/rllib/algorithms/ppo/ppo.py index 558983875df6..459bb779db52 100644 --- a/rllib/algorithms/ppo/ppo.py +++ b/rllib/algorithms/ppo/ppo.py @@ -238,7 +238,7 @@ def training( lambda_: The lambda parameter for General Advantage Estimation (GAE). Defines the exponential weight used between actually measured rewards vs value function estimates over multiple time steps. Specifically, - `lambda_` balances short-term, low-variance estimates with longer-term, + `lambda_` balances short-term, low-variance estimates against long-term, high-variance returns. A `lambda_` of 0.0 makes the GAE rely only on immediate rewards (and vf predictions from there on, reducing variance, but increasing bias), while a `lambda_` of 1.0 only incorporates vf diff --git a/rllib/connectors/common/add_states_from_episodes_to_batch.py b/rllib/connectors/common/add_states_from_episodes_to_batch.py index e4e5bfa2641a..69cf509dab54 100644 --- a/rllib/connectors/common/add_states_from_episodes_to_batch.py +++ b/rllib/connectors/common/add_states_from_episodes_to_batch.py @@ -186,7 +186,6 @@ def __init__( input_observation_space: Optional[gym.Space] = None, input_action_space: Optional[gym.Space] = None, *, - max_seq_len: Optional[int] = None, as_learner_connector: bool = False, **kwargs, ): @@ -323,14 +322,15 @@ def __call__( self.add_n_batch_items( batch=batch, column=Columns.STATE_IN, - # items_to_add.shape=(B,[state-dim]) # B=episode len // max_seq_len + # items_to_add.shape=(B,[state-dim]) + # B=episode len // max_seq_len items_to_add=tree.map_structure( # Explanation: # [::max_seq_len]: only keep every Tth state. # [:-1]: Shift state outs by one, ignore very last # STATE_OUT (but therefore add the lookback/init state at # the beginning). 
- lambda i, o: np.concatenate([[i], o[:-1]])[::max_seq_len], + lambda i, o, m=max_seq_len: np.concatenate([[i], o[:-1]])[::m], look_back_state, state_outs, ), diff --git a/rllib/tuned_examples/ppo/multi_agent_pendulum_ppo.py b/rllib/tuned_examples/ppo/multi_agent_pendulum_ppo.py index 6ef4f2dcbfaf..310fcad8d7cd 100644 --- a/rllib/tuned_examples/ppo/multi_agent_pendulum_ppo.py +++ b/rllib/tuned_examples/ppo/multi_agent_pendulum_ppo.py @@ -1,4 +1,5 @@ from ray.rllib.algorithms.ppo import PPOConfig +from ray.rllib.connectors.env_to_module import MeanStdFilter from ray.rllib.examples.envs.classes.multi_agent import MultiAgentPendulum from ray.rllib.utils.metrics import ( ENV_RUNNER_RESULTS, @@ -26,11 +27,15 @@ enable_env_runner_and_connector_v2=True, ) .environment("multi_agent_pendulum", env_config={"num_agents": args.num_agents}) + .env_runners( + env_to_module_connector=lambda env: MeanStdFilter(), + ) .training( - lr=0.0003, - lambda_=0.1, - vf_clip_param=10.0, - num_epochs=6, + train_batch_size_per_learner=1024, + minibatch_size=128, + lr=0.0002 * (args.num_gpus or 1) ** 0.5, + gamma=0.95, + lambda_=0.5, ) .rl_module( model_config_dict={ diff --git a/rllib/tuned_examples/ppo/pendulum_ppo.py b/rllib/tuned_examples/ppo/pendulum_ppo.py index a401ad720867..e34ad094eed8 100644 --- a/rllib/tuned_examples/ppo/pendulum_ppo.py +++ b/rllib/tuned_examples/ppo/pendulum_ppo.py @@ -1,7 +1,8 @@ from ray.rllib.algorithms.ppo import PPOConfig +from ray.rllib.connectors.env_to_module import MeanStdFilter from ray.rllib.utils.test_utils import add_rllib_example_script_args -parser = add_rllib_example_script_args(default_timesteps=600000, default_reward=-300) +parser = add_rllib_example_script_args(default_timesteps=400000, default_reward=-300) parser.set_defaults(enable_new_api_stack=True) # Use `parser` to add your own custom command line options to this script # and (if needed) use their values toset up `config` below. @@ -9,16 +10,19 @@ config = ( PPOConfig() + .environment("Pendulum-v1") .env_runners( num_env_runners=2, - num_envs_per_env_runner=10, + num_envs_per_env_runner=20, + env_to_module_connector=lambda env: MeanStdFilter(), ) - .environment("Pendulum-v1") .training( - lr=0.0003, - lambda_=0.1, - vf_clip_param=10.0, - num_epochs=6, + train_batch_size_per_learner=1024, + minibatch_size=128, + lr=0.0002 * (args.num_gpus or 1) ** 0.5, + gamma=0.95, + lambda_=0.5, + # num_epochs=8, ) .rl_module( model_config_dict={ From c26ae5d43cd37ccb4e3b1a5eff5714c7538241a4 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Tue, 17 Sep 2024 12:45:18 +0200 Subject: [PATCH 20/20] fix Signed-off-by: sven1977 --- rllib/tuned_examples/ppo/multi_agent_pendulum_ppo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rllib/tuned_examples/ppo/multi_agent_pendulum_ppo.py b/rllib/tuned_examples/ppo/multi_agent_pendulum_ppo.py index 310fcad8d7cd..ba2c94d0f408 100644 --- a/rllib/tuned_examples/ppo/multi_agent_pendulum_ppo.py +++ b/rllib/tuned_examples/ppo/multi_agent_pendulum_ppo.py @@ -28,7 +28,7 @@ ) .environment("multi_agent_pendulum", env_config={"num_agents": args.num_agents}) .env_runners( - env_to_module_connector=lambda env: MeanStdFilter(), + env_to_module_connector=lambda env: MeanStdFilter(multi_agent=True), ) .training( train_batch_size_per_learner=1024,
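
Taken together, patches 17-20 settle on one recipe for the PPO Pendulum tuned examples: a MeanStdFilter env-to-module connector for observation normalization, a 1024-timestep per-learner batch iterated in minibatches of 128, a learning rate scaled with the square root of the GPU count, and gamma=0.95 with lambda_=0.5. The sketch below reproduces that recipe standalone, outside the example-script harness; it assumes the new API stack is switched on explicitly via api_stack() (the tuned examples do this through their command-line parser instead), and the NUM_LEARNER_GPUS constant plus the short build()/train() loop are illustrative additions, not part of the patch.

    from ray.rllib.algorithms.ppo import PPOConfig
    from ray.rllib.connectors.env_to_module import MeanStdFilter

    # Illustrative assumption: no learner GPUs; the `or 1` keeps the lr
    # scaling factor at 1.0 in that case, mirroring the tuned examples.
    NUM_LEARNER_GPUS = 0

    config = (
        PPOConfig()
        .api_stack(
            enable_rl_module_and_learner=True,
            enable_env_runner_and_connector_v2=True,
        )
        .environment("Pendulum-v1")
        .env_runners(
            num_env_runners=2,
            num_envs_per_env_runner=20,
            # Normalize observations with a running mean/std filter before
            # they reach the RLModule.
            env_to_module_connector=lambda env: MeanStdFilter(),
        )
        .training(
            # Each training step gathers 1024 timesteps per learner and
            # iterates over them in minibatches of 128.
            train_batch_size_per_learner=1024,
            minibatch_size=128,
            lr=0.0002 * ((NUM_LEARNER_GPUS or 1) ** 0.5),
            gamma=0.95,
            lambda_=0.5,
        )
    )

    algo = config.build()
    for _ in range(5):
        print(algo.train())
    algo.stop()

The multi-agent variant in patch 20 differs only in the connector: MeanStdFilter(multi_agent=True) makes the filter operate on the per-agent observation dicts produced by the multi-agent env.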