From 596a4d88ab27cbecf4b8d10ff0cc45d304f9b026 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Fri, 30 Aug 2024 21:15:58 +0200 Subject: [PATCH 01/20] wip Signed-off-by: sven1977 --- rllib/algorithms/ppo/ppo.py | 17 +++++++---- rllib/core/learner/learner.py | 2 ++ rllib/policy/sample_batch.py | 14 +++++++-- rllib/tests/test_lstm.py | 1 - .../ppo/multi_agent_pendulum_ppo.py | 17 +++++------ rllib/tuned_examples/ppo/pendulum_ppo.py | 29 +++++-------------- rllib/utils/minibatch_utils.py | 9 ++++++ rllib/utils/test_utils.py | 6 ++-- 8 files changed, 51 insertions(+), 44 deletions(-) diff --git a/rllib/algorithms/ppo/ppo.py b/rllib/algorithms/ppo/ppo.py index fd261c44309c..2fa7c2d0d589 100644 --- a/rllib/algorithms/ppo/ppo.py +++ b/rllib/algorithms/ppo/ppo.py @@ -142,7 +142,7 @@ def __init__(self, algo_class=None): # Simple logic for now: If None, use `train_batch_size`. self.mini_batch_size_per_learner = None self.num_sgd_iter = 30 - self.shuffle_sequences = True + self.shuffle_single_agent_batch = True self.vf_loss_coeff = 1.0 self.entropy_coeff = 0.0 self.entropy_coeff_schedule = None @@ -220,7 +220,7 @@ def training( mini_batch_size_per_learner: Optional[int] = NotProvided, sgd_minibatch_size: Optional[int] = NotProvided, num_sgd_iter: Optional[int] = NotProvided, - shuffle_sequences: Optional[bool] = NotProvided, + shuffle_single_agent_batch: Optional[bool] = NotProvided, vf_loss_coeff: Optional[float] = NotProvided, entropy_coeff: Optional[float] = NotProvided, entropy_coeff_schedule: Optional[List[List[Union[int, float]]]] = NotProvided, @@ -260,8 +260,13 @@ def training( new API stack (use `mini_batch_size_per_learner` instead). num_sgd_iter: Number of SGD iterations in each outer loop (i.e., number of epochs to execute per train batch). - shuffle_sequences: Whether to shuffle sequences in the batch when training - (recommended). + shuffle_single_agent_batch: Whether to shuffle each single-agent batch once + before a new epoch (which consists of n x minibatches, where n is + `batch_size_per_learner` // `mini_batch_size_per_learner`). This should + be set to True in single-agent and independent multi-agent cases as it + ensures proper mixing of the samples before each batch epoch. Otherwise, + the sequence of minibatches iterated through is the same in each + iteration, possibly impacting learning. vf_loss_coeff: Coefficient of the value function loss. IMPORTANT: you must tune this if you set vf_share_layers=True inside your model's config. 
entropy_coeff: The entropy coefficient (float) or entropy coefficient @@ -302,8 +307,8 @@ def training( self.sgd_minibatch_size = sgd_minibatch_size if num_sgd_iter is not NotProvided: self.num_sgd_iter = num_sgd_iter - if shuffle_sequences is not NotProvided: - self.shuffle_sequences = shuffle_sequences + if shuffle_single_agent_batch is not NotProvided: + self.shuffle_single_agent_batch = shuffle_single_agent_batch if vf_loss_coeff is not NotProvided: self.vf_loss_coeff = vf_loss_coeff if entropy_coeff is not NotProvided: diff --git a/rllib/core/learner/learner.py b/rllib/core/learner/learner.py index 01d2c11da3b6..98b15a5cab18 100644 --- a/rllib/core/learner/learner.py +++ b/rllib/core/learner/learner.py @@ -1300,6 +1300,7 @@ def _update_from_batch_or_episodes( MiniBatchCyclicIterator, uses_new_env_runners=True, num_total_mini_batches=num_total_mini_batches, + shuffle=self.config.shuffle_single_agent_batch, ) else: batch_iter = MiniBatchCyclicIterator @@ -1310,6 +1311,7 @@ def _update_from_batch_or_episodes( # this behavior here by setting the minibatch size to be the size # of the batch (e.g. 1 minibatch of size batch.count) minibatch_size = batch.count + # Note that there is no need to shuffle here, b/c we don't have minibatches. batch_iter = MiniBatchCyclicIterator else: # `minibatch_size` and `num_iters` are not set by the user. diff --git a/rllib/policy/sample_batch.py b/rllib/policy/sample_batch.py index 8ed604728fc7..098ddc2218ad 100644 --- a/rllib/policy/sample_batch.py +++ b/rllib/policy/sample_batch.py @@ -462,23 +462,31 @@ def shuffle(self) -> "SampleBatch": {"a": [4, 1, 3, 2]} """ + has_time_rank = self.get(SampleBatch.SEQ_LENS) is not None # Shuffling the data when we have `seq_lens` defined is probably # a bad idea! - if self.get(SampleBatch.SEQ_LENS) is not None: + if has_time_rank and not self.zero_padded: raise ValueError( "SampleBatch.shuffle not possible when your data has " - "`seq_lens` defined!" + "`seq_lens` defined AND is not zero-padded yet!" ) # Get a permutation over the single items once and use the same # permutation for all the data (otherwise, data would become # meaningless). - permutation = np.random.permutation(self.count) + # - Shuffle by individual item. + if not has_time_rank: + permutation = np.random.permutation(self.count) + # - Shuffle along batch axis (leave axis=1/time-axis as-is). + else: + permutation = np.random.permutation(len(self[SampleBatch.SEQ_LENS])) self_as_dict = dict(self) shuffled = tree.map_structure(lambda v: v[permutation], self_as_dict) + self.update(shuffled) + # Flush cache such that intercepted values are recalculated after the # shuffling. 
self.intercepted_values = {} diff --git a/rllib/tests/test_lstm.py b/rllib/tests/test_lstm.py index 245d3db9b055..d93951be0f67 100644 --- a/rllib/tests/test_lstm.py +++ b/rllib/tests/test_lstm.py @@ -261,7 +261,6 @@ def test_minibatch_sequencing(self): "max_seq_len": 4, "vf_share_layers": True, }, - shuffle_sequences=False, # for deterministic testing ) ) ppo = config.build() diff --git a/rllib/tuned_examples/ppo/multi_agent_pendulum_ppo.py b/rllib/tuned_examples/ppo/multi_agent_pendulum_ppo.py index 757f6bbda8c1..082d505efcce 100644 --- a/rllib/tuned_examples/ppo/multi_agent_pendulum_ppo.py +++ b/rllib/tuned_examples/ppo/multi_agent_pendulum_ppo.py @@ -26,20 +26,19 @@ enable_env_runner_and_connector_v2=True, ) .environment("multi_agent_pendulum", env_config={"num_agents": args.num_agents}) + .env_runners(num_env_runners=4) + .training( + lr=0.0003, + lambda_=0.1, + vf_clip_param=10.0, + num_sgd_iter=6, + ) .rl_module( model_config_dict={ "fcnet_activation": "relu", "uses_new_env_runners": True, }, ) - .training( - train_batch_size=512, - lambda_=0.1, - gamma=0.95, - lr=0.0003, - sgd_minibatch_size=64, - vf_clip_param=10.0, - ) .multi_agent( policy_mapping_fn=lambda aid, *arg, **kw: f"p{aid}", policies={f"p{i}" for i in range(args.num_agents)}, @@ -49,7 +48,7 @@ stop = { NUM_ENV_STEPS_SAMPLED_LIFETIME: args.stop_timesteps, # Divide by num_agents to get actual return per agent. - f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": -400.0 * (args.num_agents or 1), + f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": -300.0 * (args.num_agents or 1), } diff --git a/rllib/tuned_examples/ppo/pendulum_ppo.py b/rllib/tuned_examples/ppo/pendulum_ppo.py index b74dfb5db827..84c0ddd74f90 100644 --- a/rllib/tuned_examples/ppo/pendulum_ppo.py +++ b/rllib/tuned_examples/ppo/pendulum_ppo.py @@ -1,13 +1,7 @@ from ray.rllib.algorithms.ppo import PPOConfig -from ray.rllib.utils.metrics import ( - ENV_RUNNER_RESULTS, - EPISODE_RETURN_MEAN, - EVALUATION_RESULTS, - NUM_ENV_STEPS_SAMPLED_LIFETIME, -) from ray.rllib.utils.test_utils import add_rllib_example_script_args -parser = add_rllib_example_script_args() +parser = add_rllib_example_script_args(default_timesteps=400000, default_reward=-300.0) parser.set_defaults(enable_new_api_stack=True) # Use `parser` to add your own custom command line options to this script # and (if needed) use their values toset up `config` below. 
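The SampleBatch.shuffle() change earlier in this patch permutes whole zero-padded sequences along the batch axis when `seq_lens` is present, and individual rows otherwise. Below is a minimal, self-contained sketch of that rule; the helper name and the plain-dict input are illustrative only and not part of the patch:

    import numpy as np

    def shuffle_batch(batch_dict, seq_lens=None):
        # Mirror of the shuffle rule above: with `seq_lens` (zero-padded data),
        # permute along axis 0 only, keeping each padded sequence intact;
        # without a time rank, permute individual items.
        n = len(seq_lens) if seq_lens is not None else len(next(iter(batch_dict.values())))
        permutation = np.random.permutation(n)
        return {key: value[permutation] for key, value in batch_dict.items()}
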
@@ -22,34 +16,25 @@ ) .env_runners( num_env_runners=2, - num_envs_per_env_runner=20, + num_envs_per_env_runner=10, ) .environment("Pendulum-v1") .training( - train_batch_size_per_learner=512, - gamma=0.95, lr=0.0003, lambda_=0.1, vf_clip_param=10.0, - sgd_minibatch_size=64, - model={ + num_sgd_iter=6, + ) + .rl_module( + model_config_dict={ "fcnet_activation": "relu", "uses_new_env_runners": True, }, ) - .evaluation( - evaluation_num_env_runners=1, - evaluation_interval=1, - evaluation_parallel_to_training=True, - ) ) -stop = { - NUM_ENV_STEPS_SAMPLED_LIFETIME: 400000, - f"{EVALUATION_RESULTS}/{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": -400.0, -} if __name__ == "__main__": from ray.rllib.utils.test_utils import run_rllib_example_script_experiment - run_rllib_example_script_experiment(config, args, stop=stop) + run_rllib_example_script_experiment(config, args) diff --git a/rllib/utils/minibatch_utils.py b/rllib/utils/minibatch_utils.py index 1fccb2e2fb0c..883d08d84ade 100644 --- a/rllib/utils/minibatch_utils.py +++ b/rllib/utils/minibatch_utils.py @@ -50,6 +50,7 @@ def __init__( num_iters: int = 1, uses_new_env_runners: bool = False, num_total_mini_batches: int = 0, + shuffle: bool = False, ) -> None: super().__init__(batch, minibatch_size, num_iters) self._batch = batch @@ -66,6 +67,8 @@ def __init__( self._mini_batch_count = 0 self._num_total_mini_batches = num_total_mini_batches + self._shuffle = shuffle + def __iter__(self): while ( # Make sure each item in the total batch gets at least iterated over @@ -83,6 +86,12 @@ def __iter__(self): minibatch = {} for module_id, module_batch in self._batch.policy_batches.items(): + # Shuffle the individual single-agent batch, if required. + # This should happen once per minibatch iteration in order to make + # each iteration go through a different set of minibatches. + if self._shuffle: + module_batch.shuffle() + if len(module_batch) == 0: raise ValueError( f"The batch for module_id {module_id} is empty! 
" diff --git a/rllib/utils/test_utils.py b/rllib/utils/test_utils.py index 8925024ee764..dd90ca5fedc6 100644 --- a/rllib/utils/test_utils.py +++ b/rllib/utils/test_utils.py @@ -1425,10 +1425,10 @@ def run_rllib_example_script_experiment( for i in range(stop.get(TRAINING_ITERATION, args.stop_iters)): results = algo.train() if ENV_RUNNER_RESULTS in results: - print( - f"iter={i} R={results[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN]}", - end="", + mean_return = results[ENV_RUNNER_RESULTS].get( + EPISODE_RETURN_MEAN, np.nan ) + print(f"iter={i} R={mean_return}", end="") if EVALUATION_RESULTS in results: Reval = results[EVALUATION_RESULTS][ENV_RUNNER_RESULTS][ EPISODE_RETURN_MEAN From 06ec0d1879f053a2db827cf2d8bb762239ab40a2 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Mon, 2 Sep 2024 10:10:09 +0200 Subject: [PATCH 02/20] wip Signed-off-by: sven1977 --- rllib/algorithms/algorithm_config.py | 26 ++++ rllib/algorithms/appo/appo.py | 19 ++- rllib/algorithms/appo/appo_learner.py | 6 +- rllib/algorithms/impala/impala.py | 89 +++++++------- rllib/algorithms/impala/impala_learner.py | 26 +++- rllib/algorithms/marwil/marwil.py | 2 +- rllib/algorithms/ppo/ppo.py | 77 +++--------- rllib/algorithms/ppo/tests/test_ppo.py | 6 +- .../ppo/tests/test_ppo_with_env_runner.py | 2 +- rllib/algorithms/ppo/tests/test_repro_ppo.py | 4 +- rllib/algorithms/tests/test_algorithm.py | 8 +- .../test_algorithm_checkpoint_restore.py | 2 +- .../tests/test_algorithm_rl_module_restore.py | 4 +- ..._algorithm_save_load_checkpoint_learner.py | 2 +- .../tests/test_callbacks_old_stack.py | 2 +- .../tests/test_callbacks_on_env_runner.py | 8 +- .../algorithms/tests/test_worker_failures.py | 10 +- .../run_ppo_with_inference_bm.py | 4 +- rllib/core/learner/learner.py | 112 +++++++++--------- rllib/core/learner/learner_group.py | 81 +++++-------- rllib/env/tests/test_multi_agent_env.py | 4 +- .../evaluation/tests/test_envs_that_crash.py | 2 +- rllib/evaluation/tests/test_rollout_worker.py | 2 +- .../tests/test_trajectory_view_api.py | 2 +- .../examples/actions/nested_action_spaces.py | 2 +- ...raining_step_on_and_off_policy_combined.py | 2 +- rllib/examples/cartpole_lstm.py | 2 +- .../examples/catalogs/mobilenet_v2_encoder.py | 2 +- .../restore_1_of_n_agents_from_checkpoint.py | 2 +- .../flatten_observations_dict_space.py | 2 +- rllib/examples/connectors/frame_stacking.py | 4 +- .../examples/connectors/mean_std_filtering.py | 2 +- .../connectors/prev_actions_prev_rewards.py | 2 +- .../curiosity/count_based_curiosity.py | 2 +- ...trinsic_curiosity_model_based_curiosity.py | 2 +- .../curriculum/curriculum_learning.py | 2 +- .../debugging/deterministic_training.py | 2 +- .../envs/env_rendering_and_recording.py | 2 +- rllib/examples/envs/greyscale_env.py | 4 +- rllib/examples/envs/unity3d_env_local.py | 4 +- .../gpus/float16_training_and_inference.py | 2 +- ...ed_precision_training_float16_inference.py | 2 +- ...cy_inference_after_training_w_connector.py | 2 +- .../learners/custom_loss_fn_simple.py | 2 +- .../learners/separate_vf_lr_and_optimizer.py | 2 +- .../learners/train_w_bc_finetune_w_ppo.py | 2 +- .../multi_agent/multi_agent_pendulum.py | 2 +- .../self_play_league_based_with_open_spiel.py | 2 +- .../multi_agent/self_play_with_open_spiel.py | 4 +- rllib/examples/multi_agent/two_algorithms.py | 2 +- rllib/examples/quadx_waypoints.py | 2 +- .../rl_modules/classes/lstm_containing_rlm.py | 2 +- .../rl_modules/classes/mobilenet_rlm.py | 2 +- .../rl_modules/classes/tiny_atari_cnn_rlm.py | 2 +- .../rl_modules/custom_lstm_rl_module.py | 2 +- 
rllib/execution/train_ops.py | 16 +-- rllib/models/tests/test_attention_nets.py | 6 +- rllib/models/tests/test_lstms.py | 4 +- rllib/models/tests/test_preprocessors.py | 6 +- rllib/policy/dynamic_tf_policy.py | 7 +- rllib/policy/dynamic_tf_policy_v2.py | 4 +- rllib/policy/torch_policy.py | 2 +- rllib/policy/torch_policy_v2.py | 3 +- .../checkpoints/create_checkpoints.py | 2 +- rllib/tests/test_io.py | 2 +- rllib/tests/test_lstm.py | 8 +- rllib/tests/test_nested_observation_spaces.py | 8 +- rllib/tests/test_supported_multi_agent.py | 2 +- rllib/tests/test_supported_spaces.py | 8 +- .../appo/cartpole-appo-separate-losses.py | 2 +- rllib/tuned_examples/appo/cartpole-appo.yaml | 2 +- rllib/tuned_examples/appo/cartpole_appo.py | 21 ++-- .../appo/frozenlake-appo-vtrace.yaml | 2 +- .../tuned_examples/appo/halfcheetah-appo.yaml | 2 +- ...ulti-agent-cartpole-w-100-policies-appo.py | 2 +- ...multi_agent_cartpole_appo_old_api_stack.py | 2 +- .../multi_agent_stateless_cartpole_appo.py | 7 +- rllib/tuned_examples/appo/pendulum-appo.yaml | 2 +- .../pong-appo-w-rl-modules-and-learner.yaml | 2 +- rllib/tuned_examples/appo/pong-appo.yaml | 2 +- .../appo/stateless_cartpole_appo.py | 2 +- rllib/tuned_examples/bc/cartpole_recording.py | 2 +- .../compact-regression-test.yaml | 8 +- .../impala/cartpole-impala-separate-losses.py | 2 +- ...lti_agent_cartpole_impala_old_api_stack.py | 2 +- rllib/tuned_examples/ppo/atari_ppo.py | 4 +- .../ppo/benchmark_ppo_mujoco.py | 4 +- .../ppo/benchmark_ppo_mujoco_pb2.py | 10 +- rllib/tuned_examples/ppo/cartpole-ppo.yaml | 2 +- rllib/tuned_examples/ppo/cartpole_ppo.py | 2 +- .../ppo/cartpole_truncated_ppo.py | 2 +- rllib/tuned_examples/ppo/halfcheetah-ppo.yaml | 4 +- rllib/tuned_examples/ppo/hopper-ppo.yaml | 4 +- .../tuned_examples/ppo/humanoid-ppo-gae.yaml | 4 +- rllib/tuned_examples/ppo/humanoid-ppo.yaml | 4 +- .../ppo/memory-leak-test-ppo.yaml | 4 +- .../ppo/memory_leak_test_ppo_new_stack.py | 2 +- .../ppo/multi_agent_cartpole_ppo.py | 2 +- .../ppo/multi_agent_pendulum_ppo.py | 2 +- .../ppo/multi_agent_stateless_cartpole_ppo.py | 2 +- rllib/tuned_examples/ppo/pendulum-ppo.yaml | 2 +- .../ppo/pendulum-transformed-actions-ppo.yaml | 4 +- rllib/tuned_examples/ppo/pendulum_ppo.py | 2 +- .../ppo/repeatafterme-ppo-lstm.yaml | 2 +- .../ppo/stateless_cartpole_ppo.py | 2 +- ...unity3d-soccer-strikers-vs-goalie-ppo.yaml | 4 +- rllib/tuned_examples/ppo/walker2d-ppo.yaml | 4 +- .../utils/exploration/tests/test_curiosity.py | 2 +- rllib/utils/minibatch_utils.py | 95 ++++++++------- rllib/utils/tests/test_minibatch_utils.py | 44 +++---- 110 files changed, 454 insertions(+), 469 deletions(-) diff --git a/rllib/algorithms/algorithm_config.py b/rllib/algorithms/algorithm_config.py index 5dd8de0bcc0f..0cedad5da905 100644 --- a/rllib/algorithms/algorithm_config.py +++ b/rllib/algorithms/algorithm_config.py @@ -382,6 +382,13 @@ def __init__(self, algo_class: Optional[type] = None): # Simple logic for now: If None, use `train_batch_size`. self.train_batch_size_per_learner = None self.train_batch_size = 32 # @OldAPIStack + + # These setting have been adopted from the original PPO batch settings: + # num_sgd_iter, minibatch_size, and shuffle_sequences. + self.num_epochs = 1 + self.shuffle_batch_per_epoch = False + self.minibatch_size = None + # TODO (sven): Unsolved problem with RLModules sometimes requiring settings from # the main AlgorithmConfig. 
We should not require the user to provide those # settings in both, the AlgorithmConfig (as property) AND the model config @@ -2047,6 +2054,9 @@ def training( grad_clip_by: Optional[str] = NotProvided, train_batch_size: Optional[int] = NotProvided, train_batch_size_per_learner: Optional[int] = NotProvided, + num_epochs: Optional[int] = NotProvided, + shuffle_batch_per_epoch: Optional[bool] = NotProvided, + minibatch_size: Optional[int] = NotProvided, model: Optional[dict] = NotProvided, optimizer: Optional[dict] = NotProvided, max_requests_in_flight_per_sampler_worker: Optional[int] = NotProvided, @@ -2105,6 +2115,15 @@ def training( stack, this setting should no longer be used. Instead, use `train_batch_size_per_learner` (in combination with `num_learners`). + num_epochs: The number of complete passes over the entire train batch (per + Learner). Each pass might be further split into n minibatches (if + `minibatch_size` provided). + shuffle_batch_per_epoch: Whether to shuffle the train batch once per epoch. + If the train batch has a time rank (axis=1), shuffling will only take + place along the batch axis to not disturb any intact (episode) + trajectories. + minibatch_size: The size of minibatches to use to further split the train + batch into. model: Arguments passed into the policy model. See models/catalog.py for a full list of the available model options. TODO: Provide ModelConfig objects instead of dicts. @@ -2168,6 +2187,13 @@ def training( self.train_batch_size_per_learner = train_batch_size_per_learner if train_batch_size is not NotProvided: self.train_batch_size = train_batch_size + if num_epochs is not NotProvided: + self.num_epochs = num_epochs + if shuffle_batch_per_epoch is not NotProvided: + self.shuffle_batch_per_epoch = shuffle_batch_per_epoch + if minibatch_size is not NotProvided: + self.minibatch_size = minibatch_size + if model is not NotProvided: self.model.update(model) if ( diff --git a/rllib/algorithms/appo/appo.py b/rllib/algorithms/appo/appo.py index 570e40087f98..73ceef6f3264 100644 --- a/rllib/algorithms/appo/appo.py +++ b/rllib/algorithms/appo/appo.py @@ -102,18 +102,11 @@ def __init__(self, algo_class=None): # Override some of IMPALAConfig's default values with APPO-specific values. self.num_env_runners = 2 self.min_time_s_per_iteration = 10 - self.num_gpus = 0 - self.num_multi_gpu_tower_stacks = 1 - self.minibatch_buffer_size = 1 - self.num_sgd_iter = 1 self.target_network_update_freq = 1 - self.replay_proportion = 0.0 - self.replay_buffer_num_slots = 100 self.learner_queue_size = 16 self.learner_queue_timeout = 300 self.max_sample_requests_in_flight_per_worker = 2 self.broadcast_interval = 1 - self.grad_clip = 40.0 # Note: Only when using enable_rl_module_and_learner=True can the clipping mode # be configured by the user. On the old API stack, RLlib will always clip by @@ -140,6 +133,12 @@ def __init__(self, algo_class=None): # Add constructor kwargs here (if any). } + self.num_gpus = 0 # @OldAPIStack + self.num_multi_gpu_tower_stacks = 1 # @OldAPIStack + self.minibatch_buffer_size = 1 # @OldAPIStack + self.replay_proportion = 0.0 # @OldAPIStack + self.replay_buffer_num_slots = 100 # @OldAPIStack + # __sphinx_doc_end__ # fmt: on @@ -185,7 +184,7 @@ def training( target_network_update_freq: The frequency to update the target policy and tune the kl loss coefficients that are used during training. 
After setting this parameter, the algorithm waits for at least - `target_network_update_freq * minibatch_size * num_sgd_iter` number of + `target_network_update_freq * minibatch_size * num_epochs` number of samples to be trained on by the learner group before updating the target networks and tuned the kl loss coefficients that are used during training. @@ -292,7 +291,7 @@ def training_step(self) -> ResultDict: # Update the target network and the KL coefficient for the APPO-loss. # The target network update frequency is calculated automatically by the product - # of `num_sgd_iter` setting (usually 1 for APPO) and `minibatch_buffer_size`. + # of `num_epochs` setting (usually 1 for APPO) and `minibatch_buffer_size`. if self.config.enable_rl_module_and_learner: if NUM_TARGET_UPDATES in train_results: self._counters[NUM_TARGET_UPDATES] += train_results[NUM_TARGET_UPDATES] @@ -309,7 +308,7 @@ def training_step(self) -> ResultDict: ) ] target_update_freq = ( - self.config.num_sgd_iter * self.config.minibatch_buffer_size + self.config.num_epochs * self.config.minibatch_buffer_size ) if cur_ts - last_update > target_update_freq: self._counters[NUM_TARGET_UPDATES] += 1 diff --git a/rllib/algorithms/appo/appo_learner.py b/rllib/algorithms/appo/appo_learner.py index a1c06a854309..ff67637f4257 100644 --- a/rllib/algorithms/appo/appo_learner.py +++ b/rllib/algorithms/appo/appo_learner.py @@ -90,14 +90,14 @@ def after_gradient_based_update(self, *, timesteps: Dict[str, Any]) -> None: # TODO (avnish) Using steps trained here instead of sampled ... I'm not sure # why the other implementation uses sampled. # The difference in steps sampled/trained is pretty - # much always going to be larger than self.config.num_sgd_iter * + # much always going to be larger than self.config.num_epochs * # self.config.minibatch_buffer_size unless the number of steps collected # is really small. The thing is that the default rollout fragment length - # is 50, so the minibatch buffer size * num_sgd_iter is going to be + # is 50, so the minibatch buffer size * num_epochs is going to be # have to be 50 to even meet the threshold of having delayed target # updates. # We should instead have the target / kl threshold update be based off - # of the train_batch_size * some target update frequency * num_sgd_iter. + # of the train_batch_size * some target update frequency * num_epochs. 
last_update_ts_key = (module_id, LAST_TARGET_UPDATE_TS) if timestep - self.metrics.peek( diff --git a/rllib/algorithms/impala/impala.py b/rllib/algorithms/impala/impala.py index a06b9280dbf1..9ad590f72f34 100644 --- a/rllib/algorithms/impala/impala.py +++ b/rllib/algorithms/impala/impala.py @@ -134,7 +134,6 @@ def __init__(self, algo_class=None): self.vtrace_clip_pg_rho_threshold = 1.0 self.num_multi_gpu_tower_stacks = 1 # @OldAPIstack self.minibatch_buffer_size = 1 # @OldAPIstack - self.num_sgd_iter = 1 self.replay_proportion = 0.0 # @OldAPIstack self.replay_buffer_num_slots = 0 # @OldAPIstack self.learner_queue_size = 3 @@ -171,7 +170,7 @@ def __init__(self, algo_class=None): self.rollout_fragment_length = 50 self.train_batch_size = 500 # @OldAPIstack self.train_batch_size_per_learner = 500 - self._minibatch_size = "auto" + #self._minibatch_size = "auto" self.num_env_runners = 2 self.num_gpus = 1 # @OldAPIstack self.lr = 0.0005 @@ -200,8 +199,6 @@ def training( num_gpu_loader_threads: Optional[int] = NotProvided, num_multi_gpu_tower_stacks: Optional[int] = NotProvided, minibatch_buffer_size: Optional[int] = NotProvided, - minibatch_size: Optional[Union[int, str]] = NotProvided, - num_sgd_iter: Optional[int] = NotProvided, replay_proportion: Optional[float] = NotProvided, replay_buffer_num_slots: Optional[int] = NotProvided, learner_queue_size: Optional[int] = NotProvided, @@ -252,15 +249,7 @@ def training( - This enables us to preload data into these stacks while another stack is performing gradient calculations. minibatch_buffer_size: How many train batches should be retained for - minibatching. This conf only has an effect if `num_sgd_iter > 1`. - minibatch_size: The size of minibatches that are trained over during - each SGD iteration. If "auto", will use the same value as - `train_batch_size`. - Note that this setting only has an effect if - `enable_rl_module_and_learner=True` and it must be a multiple of - `rollout_fragment_length` or `sequence_length` and smaller than or equal - to `train_batch_size`. - num_sgd_iter: Number of passes to make over each train batch. + minibatching. This conf only has an effect if `num_epochs > 1`. replay_proportion: Set >0 to enable experience replay. Saved samples will be replayed with a p:1 proportion to new data samples. replay_buffer_num_slots: Number of sample batches to store for replay. @@ -330,8 +319,6 @@ def training( self.num_multi_gpu_tower_stacks = num_multi_gpu_tower_stacks if minibatch_buffer_size is not NotProvided: self.minibatch_buffer_size = minibatch_buffer_size - if num_sgd_iter is not NotProvided: - self.num_sgd_iter = num_sgd_iter if replay_proportion is not NotProvided: self.replay_proportion = replay_proportion if replay_buffer_num_slots is not NotProvided: @@ -374,8 +361,6 @@ def training( self._separate_vf_optimizer = _separate_vf_optimizer if _lr_vf is not NotProvided: self._lr_vf = _lr_vf - if minibatch_size is not NotProvided: - self._minibatch_size = minibatch_size return self @@ -450,21 +435,21 @@ def validate(self) -> None: "config.training(_tf_policy_handles_more_than_one_loss=True)." ) # Learner API specific checks. 
- if ( - self.enable_rl_module_and_learner - and self._minibatch_size != "auto" - and not ( - (self.minibatch_size % self.rollout_fragment_length == 0) - and self.minibatch_size <= self.total_train_batch_size - ) - ): - raise ValueError( - f"`minibatch_size` ({self._minibatch_size}) must either be 'auto' " - "or a multiple of `rollout_fragment_length` " - f"({self.rollout_fragment_length}) while at the same time smaller " - "than or equal to `total_train_batch_size` " - f"({self.total_train_batch_size})!" - ) + #if ( + # self.enable_rl_module_and_learner + # and self._minibatch_size != "auto" + # and not ( + # (self.minibatch_size % self.rollout_fragment_length == 0) + # and self.minibatch_size <= self.total_train_batch_size + # ) + #): + # raise ValueError( + # f"`minibatch_size` ({self._minibatch_size}) must either be 'auto' " + # "or a multiple of `rollout_fragment_length` " + # f"({self.rollout_fragment_length}) while at the same time smaller " + # "than or equal to `total_train_batch_size` " + # f"({self.total_train_batch_size})!" + # ) @property def replay_ratio(self) -> float: @@ -474,19 +459,19 @@ def replay_ratio(self) -> float: """ return (1 / self.replay_proportion) if self.replay_proportion > 0 else 0.0 - @property - def minibatch_size(self): - # If 'auto', use the train_batch_size (meaning each SGD iter is a single pass - # through the entire train batch). Otherwise, use user provided setting. - return ( - ( - self.train_batch_size_per_learner - if self.enable_env_runner_and_connector_v2 - else self.train_batch_size - ) - if self._minibatch_size == "auto" - else self._minibatch_size - ) + #@property + #def minibatch_size(self): + # # If 'auto', use the train_batch_size (meaning each SGD iter is a single pass + # # through the entire train batch). Otherwise, use user provided setting. + # return ( + # ( + # self.train_batch_size_per_learner + # if self.enable_env_runner_and_connector_v2 + # else self.train_batch_size + # ) + # if self._minibatch_size == "auto" + # else self._minibatch_size + # ) @override(AlgorithmConfig) def get_default_learner_class(self): @@ -539,7 +524,7 @@ class IMPALA(Algorithm): 2. If enabled, the replay buffer stores and produces batches of size `rollout_fragment_length * num_envs_per_env_runner`. 3. If enabled, the minibatch ring buffer stores and replays batches of - size `train_batch_size` up to `num_sgd_iter` times per batch. + size `train_batch_size` up to `num_epochs` times per batch. 4. The learner thread executes data parallel SGD across `num_gpus` GPUs on batches of size `train_batch_size`. 
""" @@ -734,6 +719,9 @@ def training_step(self) -> ResultDict: NUM_ENV_STEPS_SAMPLED_LIFETIME, default=0 ), }, + num_epochs=self.config.num_epochs, + minibatch_size=self.config.minibatch_size, + shuffle_batch_per_epoch=self.config.shuffle_batch_per_epoch, ) else: learner_results = self.learner_group.update_from_episodes( @@ -745,6 +733,9 @@ def training_step(self) -> ResultDict: NUM_ENV_STEPS_SAMPLED_LIFETIME, default=0 ), }, + num_epochs=self.config.num_epochs, + minibatch_size=self.config.minibatch_size, + shuffle_batch_per_epoch=self.config.shuffle_batch_per_epoch, ) if not do_async_updates: learner_results = [learner_results] @@ -1292,7 +1283,7 @@ def _learn_on_processed_samples(self) -> ResultDict: ), }, async_update=async_update, - num_iters=self.config.num_sgd_iter, + num_epochs=self.config.num_epochs, minibatch_size=self.config.minibatch_size, ) if not async_update: @@ -1531,7 +1522,7 @@ def make_learner_thread(local_worker, config): lr=config["lr"], train_batch_size=config["train_batch_size"], num_multi_gpu_tower_stacks=config["num_multi_gpu_tower_stacks"], - num_sgd_iter=config["num_sgd_iter"], + num_sgd_iter=config["num_epochs"], learner_queue_size=config["learner_queue_size"], learner_queue_timeout=config["learner_queue_timeout"], num_data_load_threads=config["num_gpu_loader_threads"], @@ -1540,7 +1531,7 @@ def make_learner_thread(local_worker, config): learner_thread = LearnerThread( local_worker, minibatch_buffer_size=config["minibatch_buffer_size"], - num_sgd_iter=config["num_sgd_iter"], + num_sgd_iter=config["num_epochs"], learner_queue_size=config["learner_queue_size"], learner_queue_timeout=config["learner_queue_timeout"], ) diff --git a/rllib/algorithms/impala/impala_learner.py b/rllib/algorithms/impala/impala_learner.py index 651515666f89..f6f6df0cdb1e 100644 --- a/rllib/algorithms/impala/impala_learner.py +++ b/rllib/algorithms/impala/impala_learner.py @@ -93,6 +93,9 @@ def build(self) -> None: in_queue=self._learner_thread_in_queue, out_queue=self._learner_thread_out_queue, metrics_logger=self.metrics, + num_epochs=self.config.num_epochs, + minibatch_size=self.config.minibatch_size, + shuffle_batch_per_epoch=self.config.shuffle_batch_per_epoch, ) self._learner_thread.start() @@ -105,8 +108,8 @@ def update_from_episodes( # TODO (sven): Deprecate these in favor of config attributes for only those # algos that actually need (and know how) to do minibatching. minibatch_size: Optional[int] = None, - num_iters: int = 1, - num_total_mini_batches: int = 0, + num_epochs: int = 1, + num_total_minibatches: int = 0, reduce_fn=None, # Deprecated args. 
**kwargs, ) -> ResultDict: @@ -225,7 +228,17 @@ def _step(self) -> None: class _LearnerThread(threading.Thread): - def __init__(self, *, update_method, in_queue, out_queue, metrics_logger): + def __init__( + self, + *, + update_method, + in_queue, + out_queue, + metrics_logger, + num_epochs, + minibatch_size, + shuffle_batch_per_epoch, + ): super().__init__() self.daemon = True self.metrics: MetricsLogger = metrics_logger @@ -235,6 +248,10 @@ def __init__(self, *, update_method, in_queue, out_queue, metrics_logger): self._in_queue: deque = in_queue self._out_queue: Queue = out_queue + self._num_epochs = num_epochs + self._minibatch_size = minibatch_size + self._shuffle_batch_per_epoch = shuffle_batch_per_epoch + def run(self) -> None: while not self.stopped: self.step() @@ -260,6 +277,9 @@ def step(self): NUM_ENV_STEPS_SAMPLED_LIFETIME, default=0 ) }, + num_epochs=self._num_epochs, + minibatch_size=self._minibatch_size, + shuffle_batch_per_epoch=self._shuffle_batch_per_epoch, ) # We have to deepcopy the results dict, b/c we must avoid having a returned # Stats object sit in the queue and getting a new (possibly even tensor) diff --git a/rllib/algorithms/marwil/marwil.py b/rllib/algorithms/marwil/marwil.py index de9965de8d7d..7dbe8c85566f 100644 --- a/rllib/algorithms/marwil/marwil.py +++ b/rllib/algorithms/marwil/marwil.py @@ -398,7 +398,7 @@ class (multi-/single-learner setup) and evaluation on learner_results = self.learner_group.update_from_batch( batch, minibatch_size=self.config.train_batch_size_per_learner, - num_iters=self.config.dataset_num_iters_per_learner, + num_epochs=self.config.dataset_num_iters_per_learner, ) # Log training results. diff --git a/rllib/algorithms/ppo/ppo.py b/rllib/algorithms/ppo/ppo.py index 2fa7c2d0d589..a627b8df1d1d 100644 --- a/rllib/algorithms/ppo/ppo.py +++ b/rllib/algorithms/ppo/ppo.py @@ -130,6 +130,7 @@ def __init__(self, algo_class=None): self.lr = 5e-5 self.rollout_fragment_length = "auto" self.train_batch_size = 4000 + self.shuffle_batch_per_epoch = True # PPO specific settings: self.use_critic = True @@ -138,11 +139,8 @@ def __init__(self, algo_class=None): self.use_kl_loss = True self.kl_coeff = 0.2 self.kl_target = 0.01 - self.sgd_minibatch_size = 128 - # Simple logic for now: If None, use `train_batch_size`. - self.mini_batch_size_per_learner = None - self.num_sgd_iter = 30 - self.shuffle_single_agent_batch = True + self.minibatch_size = 128 + self.num_epochs = 30 self.vf_loss_coeff = 1.0 self.entropy_coeff = 0.0 self.entropy_coeff_schedule = None @@ -157,6 +155,7 @@ def __init__(self, algo_class=None): # fmt: on # Deprecated keys. + self.sgd_minibatch_size = DEPRECATED_VALUE self.vf_share_layers = DEPRECATED_VALUE self.exploration_config = { @@ -217,10 +216,6 @@ def training( use_kl_loss: Optional[bool] = NotProvided, kl_coeff: Optional[float] = NotProvided, kl_target: Optional[float] = NotProvided, - mini_batch_size_per_learner: Optional[int] = NotProvided, - sgd_minibatch_size: Optional[int] = NotProvided, - num_sgd_iter: Optional[int] = NotProvided, - shuffle_single_agent_batch: Optional[bool] = NotProvided, vf_loss_coeff: Optional[float] = NotProvided, entropy_coeff: Optional[float] = NotProvided, entropy_coeff_schedule: Optional[List[List[Union[int, float]]]] = NotProvided, @@ -244,29 +239,6 @@ def training( use_kl_loss: Whether to use the KL-term in the loss function. kl_coeff: Initial coefficient for KL divergence. kl_target: Target value for KL divergence. - mini_batch_size_per_learner: Only use if new API stack is enabled. 
- The mini batch size per Learner worker. This is the - batch size that each Learner worker's training batch (whose size is - `s`elf.train_batch_size_per_learner`) will be split into. For example, - if the train batch size per Learner worker is 4000 and the mini batch - size per Learner worker is 400, the train batch will be split into 10 - equal sized chunks (or "mini batches"). Each such mini batch will be - used for one SGD update. Overall, the train batch on each Learner - worker will be traversed `self.num_sgd_iter` times. In the above - example, if `self.num_sgd_iter` is 5, we will altogether perform 50 - (10x5) SGD updates per Learner update step. - sgd_minibatch_size: Total SGD batch size across all devices for SGD. - This defines the minibatch size within each epoch. Deprecated on the - new API stack (use `mini_batch_size_per_learner` instead). - num_sgd_iter: Number of SGD iterations in each outer loop (i.e., number of - epochs to execute per train batch). - shuffle_single_agent_batch: Whether to shuffle each single-agent batch once - before a new epoch (which consists of n x minibatches, where n is - `batch_size_per_learner` // `mini_batch_size_per_learner`). This should - be set to True in single-agent and independent multi-agent cases as it - ensures proper mixing of the samples before each batch epoch. Otherwise, - the sequence of minibatches iterated through is the same in each - iteration, possibly impacting learning. vf_loss_coeff: Coefficient of the value function loss. IMPORTANT: you must tune this if you set vf_share_layers=True inside your model's config. entropy_coeff: The entropy coefficient (float) or entropy coefficient @@ -301,14 +273,6 @@ def training( self.kl_coeff = kl_coeff if kl_target is not NotProvided: self.kl_target = kl_target - if mini_batch_size_per_learner is not NotProvided: - self.mini_batch_size_per_learner = mini_batch_size_per_learner - if sgd_minibatch_size is not NotProvided: - self.sgd_minibatch_size = sgd_minibatch_size - if num_sgd_iter is not NotProvided: - self.num_sgd_iter = num_sgd_iter - if shuffle_single_agent_batch is not NotProvided: - self.shuffle_single_agent_batch = shuffle_single_agent_batch if vf_loss_coeff is not NotProvided: self.vf_loss_coeff = vf_loss_coeff if entropy_coeff is not NotProvided: @@ -342,28 +306,28 @@ def validate(self) -> None: self.validate_train_batch_size_vs_rollout_fragment_length() # SGD minibatch size must be smaller than train_batch_size (b/c - # we subsample a batch of `sgd_minibatch_size` from the train-batch for - # each `num_sgd_iter`). + # we subsample a batch of `minibatch_size` from the train-batch for + # each `num_epochs`). if ( not self.enable_rl_module_and_learner - and self.sgd_minibatch_size > self.train_batch_size + and self.minibatch_size > self.train_batch_size ): raise ValueError( - f"`sgd_minibatch_size` ({self.sgd_minibatch_size}) must be <= " + f"`minibatch_size` ({self.minibatch_size}) must be <= " f"`train_batch_size` ({self.train_batch_size}). In PPO, the train batch" - f" will be split into {self.sgd_minibatch_size} chunks, each of which " - f"is iterated over (used for updating the policy) {self.num_sgd_iter} " + f" will be split into {self.minibatch_size} chunks, each of which " + f"is iterated over (used for updating the policy) {self.num_epochs} " "times." 
) elif self.enable_rl_module_and_learner: - mbs = self.mini_batch_size_per_learner or self.sgd_minibatch_size + mbs = self.minibatch_size tbs = self.train_batch_size_per_learner or self.train_batch_size if isinstance(mbs, int) and isinstance(tbs, int) and mbs > tbs: raise ValueError( - f"`mini_batch_size_per_learner` ({mbs}) must be <= " + f"`minibatch_size` ({mbs}) must be <= " f"`train_batch_size_per_learner` ({tbs}). In PPO, the train batch" f" will be split into {mbs} chunks, each of which is iterated over " - f"(used for updating the policy) {self.num_sgd_iter} times." + f"(used for updating the policy) {self.num_epochs} times." ) # Episodes may only be truncated (and passed into PPO's @@ -495,11 +459,8 @@ def _training_step_new_api_stack(self) -> ResultDict: self.metrics.peek(NUM_ENV_STEPS_SAMPLED_LIFETIME) ), }, - minibatch_size=( - self.config.mini_batch_size_per_learner - or self.config.sgd_minibatch_size - ), - num_iters=self.config.num_sgd_iter, + minibatch_size=self.config.minibatch_size, + num_epochs=self.config.num_epochs, ) self.metrics.merge_and_log_n_dicts(learner_results, key=LEARNER_RESULTS) self.metrics.log_dict( @@ -565,14 +526,10 @@ def _training_step_old_and_hybrid_api_stacks(self) -> ResultDict: # Perform a train step on the collected batch. if self.config.enable_rl_module_and_learner: - mini_batch_size_per_learner = ( - self.config.mini_batch_size_per_learner - or self.config.sgd_minibatch_size - ) train_results = self.learner_group.update_from_batch( batch=train_batch, - minibatch_size=mini_batch_size_per_learner, - num_iters=self.config.num_sgd_iter, + minibatch_size=self.config.minibatch_size, + num_epochs=self.config.num_epochs, ) elif self.config.simple_optimizer: diff --git a/rllib/algorithms/ppo/tests/test_ppo.py b/rllib/algorithms/ppo/tests/test_ppo.py index c99bc9c8feac..981473e1432b 100644 --- a/rllib/algorithms/ppo/tests/test_ppo.py +++ b/rllib/algorithms/ppo/tests/test_ppo.py @@ -126,7 +126,7 @@ def test_ppo_compilation_w_connectors(self): config = ( ppo.PPOConfig() .training( - num_sgd_iter=2, + num_epochs=2, # Setup lr schedule for testing. lr_schedule=[[0, 5e-5], [128, 0.0]], # Set entropy_coeff to a faulty value to proof that it'll get @@ -199,8 +199,8 @@ def test_ppo_compilation_and_schedule_mixins(self): entropy_coeff=100.0, entropy_coeff_schedule=[[0, 0.1], [512, 0.0]], train_batch_size=256, - sgd_minibatch_size=128, - num_sgd_iter=2, + minibatch_size=128, + num_epochs=2, model=dict( # Settings in case we use an LSTM. lstm_cell_size=10, diff --git a/rllib/algorithms/ppo/tests/test_ppo_with_env_runner.py b/rllib/algorithms/ppo/tests/test_ppo_with_env_runner.py index 1794c24bb5ba..5166ceb2d34a 100644 --- a/rllib/algorithms/ppo/tests/test_ppo_with_env_runner.py +++ b/rllib/algorithms/ppo/tests/test_ppo_with_env_runner.py @@ -73,7 +73,7 @@ def test_ppo_compilation_and_schedule_mixins(self): ) .env_runners(num_env_runners=0) .training( - num_sgd_iter=2, + num_epochs=2, # Setup lr schedule for testing lr-scheduling correctness. 
lr=[[0, 0.00001], [512, 0.0]], # 512=4x128 # Setup `entropy_coeff` schedule for testing whether it's scheduled diff --git a/rllib/algorithms/ppo/tests/test_repro_ppo.py b/rllib/algorithms/ppo/tests/test_repro_ppo.py index 7d0fdcfaef2f..50dcd7912d5a 100644 --- a/rllib/algorithms/ppo/tests/test_repro_ppo.py +++ b/rllib/algorithms/ppo/tests/test_repro_ppo.py @@ -29,7 +29,7 @@ def test_reproducibility_ppo_cartpole(self): ppo.PPOConfig() .environment(env="DeterministicCartPole-v1", env_config={"seed": 42}) .env_runners(rollout_fragment_length=8) - .training(train_batch_size=64, sgd_minibatch_size=32, num_sgd_iter=2) + .training(train_batch_size=64, minibatch_size=32, num_epochs=2) ) check_reproducibilty( algo_class=ppo.PPO, @@ -47,7 +47,7 @@ def test_reproducibility_ppo_pendulum(self): ppo.PPOConfig() .environment(env="DeterministicPendulum-v1", env_config={"seed": 42}) .env_runners(rollout_fragment_length=8) - .training(train_batch_size=64, sgd_minibatch_size=32, num_sgd_iter=2) + .training(train_batch_size=64, minibatch_size=32, num_epochs=2) ) check_reproducibilty( algo_class=ppo.PPO, diff --git a/rllib/algorithms/tests/test_algorithm.py b/rllib/algorithms/tests/test_algorithm.py index 97b1cda0c9fe..12c98ce50f60 100644 --- a/rllib/algorithms/tests/test_algorithm.py +++ b/rllib/algorithms/tests/test_algorithm.py @@ -54,8 +54,8 @@ def test_add_module_and_remove_module(self): .env_runners(num_cpus_per_env_runner=0.1) .training( train_batch_size=100, - sgd_minibatch_size=50, - num_sgd_iter=1, + minibatch_size=50, + num_epochs=1, ) .rl_module( model_config_dict={ @@ -224,8 +224,8 @@ def test_add_policy_and_remove_policy(self): .env_runners(num_cpus_per_env_runner=0.1) .training( train_batch_size=100, - sgd_minibatch_size=50, - num_sgd_iter=1, + minibatch_size=50, + num_epochs=1, model={ "fcnet_hiddens": [5], "fcnet_activation": "linear", diff --git a/rllib/algorithms/tests/test_algorithm_checkpoint_restore.py b/rllib/algorithms/tests/test_algorithm_checkpoint_restore.py index f88b54347a84..b4c2a7b1b6ce 100644 --- a/rllib/algorithms/tests/test_algorithm_checkpoint_restore.py +++ b/rllib/algorithms/tests/test_algorithm_checkpoint_restore.py @@ -27,7 +27,7 @@ # See the comment before the `algorithms_and_configs` dict. # explore is set to None for PPO in favor of RLModule API support. 
PPOConfig() - .training(num_sgd_iter=5, train_batch_size=1000) + .training(num_epochs=5, train_batch_size=1000) .env_runners(num_env_runners=2) .resources(num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0"))) .evaluation( diff --git a/rllib/algorithms/tests/test_algorithm_rl_module_restore.py b/rllib/algorithms/tests/test_algorithm_rl_module_restore.py index 1dd50fb84035..7b44191ce0c3 100644 --- a/rllib/algorithms/tests/test_algorithm_rl_module_restore.py +++ b/rllib/algorithms/tests/test_algorithm_rl_module_restore.py @@ -54,7 +54,7 @@ def policy_mapping_fn(agent_id, episode, worker, **kwargs): .env_runners(rollout_fragment_length=4) .learners(**scaling_config) .environment(MultiAgentCartPole, env_config={"num_agents": num_agents}) - .training(num_sgd_iter=1, train_batch_size=8, sgd_minibatch_size=8) + .training(num_epochs=1, train_batch_size=8, minibatch_size=8) .multi_agent(policies=policies, policy_mapping_fn=policy_mapping_fn) ) return config @@ -190,7 +190,7 @@ def test_e2e_load_rl_module(self): .env_runners(rollout_fragment_length=4) .learners(**scaling_config) .environment("CartPole-v1") - .training(num_sgd_iter=1, train_batch_size=8, sgd_minibatch_size=8) + .training(num_epochs=1, train_batch_size=8, minibatch_size=8) ) env = gym.make("CartPole-v1") # create a multi_rl_module to load and save it to a checkpoint directory diff --git a/rllib/algorithms/tests/test_algorithm_save_load_checkpoint_learner.py b/rllib/algorithms/tests/test_algorithm_save_load_checkpoint_learner.py index 3b71c09528bf..19683a89876d 100644 --- a/rllib/algorithms/tests/test_algorithm_save_load_checkpoint_learner.py +++ b/rllib/algorithms/tests/test_algorithm_save_load_checkpoint_learner.py @@ -10,7 +10,7 @@ algorithms_and_configs = { - "PPO": (PPOConfig().training(train_batch_size=2, sgd_minibatch_size=2)) + "PPO": (PPOConfig().training(train_batch_size=2, minibatch_size=2)) } diff --git a/rllib/algorithms/tests/test_callbacks_old_stack.py b/rllib/algorithms/tests/test_callbacks_old_stack.py index dcbe2e516733..feef340c41ca 100644 --- a/rllib/algorithms/tests/test_callbacks_old_stack.py +++ b/rllib/algorithms/tests/test_callbacks_old_stack.py @@ -79,7 +79,7 @@ def test_episode_and_sample_callbacks(self): .environment("CartPole-v1") .env_runners(num_env_runners=0) .callbacks(EpisodeAndSampleCallbacks) - .training(train_batch_size=50, sgd_minibatch_size=50, num_sgd_iter=1) + .training(train_batch_size=50, minibatch_size=50, num_epochs=1) ) algo = config.build() algo.train() diff --git a/rllib/algorithms/tests/test_callbacks_on_env_runner.py b/rllib/algorithms/tests/test_callbacks_on_env_runner.py index 2b0ca696edf6..42abf7091841 100644 --- a/rllib/algorithms/tests/test_callbacks_on_env_runner.py +++ b/rllib/algorithms/tests/test_callbacks_on_env_runner.py @@ -106,8 +106,8 @@ def test_episode_and_sample_callbacks_batch_mode_truncate_episodes(self): .callbacks(EpisodeAndSampleCallbacks) .training( train_batch_size=50, # <- rollout_fragment_length=50 - sgd_minibatch_size=50, - num_sgd_iter=1, + minibatch_size=50, + num_epochs=1, ) ) @@ -158,8 +158,8 @@ def test_episode_and_sample_callbacks_batch_mode_complete_episodes(self): .callbacks(EpisodeAndSampleCallbacks) .training( train_batch_size=50, # <- rollout_fragment_length=50 - sgd_minibatch_size=50, - num_sgd_iter=1, + minibatch_size=50, + num_epochs=1, ) ) diff --git a/rllib/algorithms/tests/test_worker_failures.py b/rllib/algorithms/tests/test_worker_failures.py index 8ae1a2d69102..8e603694a158 100644 --- a/rllib/algorithms/tests/test_worker_failures.py +++ 
b/rllib/algorithms/tests/test_worker_failures.py @@ -452,8 +452,8 @@ def test_multi_gpu(self): .env_runners(env_runner_cls=ForwardHealthCheckToEnvWorker) .training( train_batch_size=10, - sgd_minibatch_size=1, - num_sgd_iter=1, + minibatch_size=1, + num_epochs=1, ) ) @@ -561,7 +561,7 @@ def test_workers_failing_recover(self): ) .training( train_batch_size_per_learner=32, - sgd_minibatch_size=32, + minibatch_size=32, ) .environment( env="fault_env", @@ -620,7 +620,7 @@ def test_modules_are_restored_on_recovered_worker(self): ) .training( train_batch_size_per_learner=32, - sgd_minibatch_size=32, + minibatch_size=32, ) .environment( env="multi_agent_fault_env", @@ -729,7 +729,7 @@ def test_eval_workers_failing_recover(self): ) .training( train_batch_size_per_learner=32, - sgd_minibatch_size=32, + minibatch_size=32, ) .environment(env="fault_env") .evaluation( diff --git a/rllib/benchmarks/torch_compile/run_ppo_with_inference_bm.py b/rllib/benchmarks/torch_compile/run_ppo_with_inference_bm.py index a941f66deff1..fa046b05285d 100644 --- a/rllib/benchmarks/torch_compile/run_ppo_with_inference_bm.py +++ b/rllib/benchmarks/torch_compile/run_ppo_with_inference_bm.py @@ -43,8 +43,8 @@ def main(pargs): vf_clip_param=10.0, entropy_coeff=0.01, train_batch_size=32 if pargs.smoke_test else 16000, - sgd_minibatch_size=1 if pargs.smoke_test else 2000, - num_sgd_iter=1 if pargs.smoke_test else 10, + minibatch_size=1 if pargs.smoke_test else 2000, + num_epochs=1 if pargs.smoke_test else 10, vf_loss_coeff=0.01, clip_param=0.1, lr=0.0001, diff --git a/rllib/core/learner/learner.py b/rllib/core/learner/learner.py index 98b15a5cab18..14590f4b9e94 100644 --- a/rllib/core/learner/learner.py +++ b/rllib/core/learner/learner.py @@ -866,7 +866,7 @@ def compute_losses( fwd_out: Output from a call to the `forward_train()` method of the underlying MultiRLModule (`self.module`) during training (`self.update()`). - batch: The training batch that was used to compute `fwd_out`. + batch: The train batch that was used to compute `fwd_out`. Returns: A dictionary mapping module IDs to individual loss terms. @@ -905,7 +905,7 @@ def compute_loss_for_module( Args: module_id: The id of the module. config: The AlgorithmConfig specific to the given `module_id`. - batch: The sample batch for this particular module. + batch: The train batch for this particular module. fwd_out: The output of the forward pass for this particular module. Returns: @@ -925,17 +925,15 @@ def update_from_batch( *, # TODO (sven): Make this a more formal structure with its own type. timesteps: Optional[Dict[str, Any]] = None, - # TODO (sven): Deprecate these in favor of config attributes for only those - # algos that actually need (and know how) to do minibatching. + num_epochs: int = 1, minibatch_size: Optional[int] = None, - num_iters: int = 1, # Deprecated args. - reduce_fn=DEPRECATED_VALUE, + num_iters=DEPRECATED_VALUE, ) -> ResultDict: - """Do `num_iters` minibatch updates given a train batch. + """Run `num_epochs` epochs over the given train batch. You can use this method to take more than one backward pass on the batch. - The same `minibatch_size` and `num_iters` will be used for all module ids in + The same `minibatch_size` and `num_epochs` will be used for all module ids in MultiRLModule. Args: @@ -943,9 +941,12 @@ def update_from_batch( timesteps: Timesteps dict, which must have the key `NUM_ENV_STEPS_SAMPLED_LIFETIME`. # TODO (sven): Make this a more formal structure with its own type. 
- minibatch_size: The size of the minibatch to use for each update. - num_iters: The number of complete passes over all the sub-batches - in the input multi-agent batch. + num_epochs: The number of complete passes over the entire train batch. Each + pass might be further split into n minibatches (if `minibatch_size` + provided). The train batch is generated from the given `episodes` + through the Learner connector pipeline. + minibatch_size: The size of minibatches to use to further split the train + batch into. Returns: A `ResultDict` object produced by a call to `self.metrics.reduce()`. The @@ -954,21 +955,17 @@ def update_from_batch( Learner) to further reduce these results (for example over n parallel Learners). """ - if reduce_fn != DEPRECATED_VALUE: + if num_iters != DEPRECATED_VALUE: deprecation_warning( - old="Learner.update_from_batch(reduce_fn=..)", - new="Learner.metrics.[log_value|log_dict|log_time](key=..., value=..., " - "reduce=[mean|min|max|sum], window=..., ema_coeff=...)", - help="Use the new ray.rllib.utils.metrics.metrics_logger::MetricsLogger" - " API in your custom Learner methods for logging your custom values " - "and time-reducing (or parallel-reducing) them.", + old="Learner.update_from_episodes(num_iters=...)", + new="Learner.update_from_episodes(num_epochs=...)", error=True, ) return self._update_from_batch_or_episodes( batch=batch, timesteps=timesteps, + num_epochs=num_epochs, minibatch_size=minibatch_size, - num_iters=num_iters, ) def update_from_episodes( @@ -977,18 +974,16 @@ def update_from_episodes( *, # TODO (sven): Make this a more formal structure with its own type. timesteps: Optional[Dict[str, Any]] = None, - # TODO (sven): Deprecate these in favor of config attributes for only those - # algos that actually need (and know how) to do minibatching. + num_epochs: int = 1, minibatch_size: Optional[int] = None, - num_iters: int = 1, - num_total_mini_batches: int = 0, + num_total_minibatches: int = 0, # Deprecated args. - reduce_fn=DEPRECATED_VALUE, + num_iters=DEPRECATED_VALUE, ) -> ResultDict: - """Do `num_iters` minibatch updates given a list of episodes. + """Run `num_epochs` epochs over the train batch generated from `episodes`. You can use this method to take more than one backward pass on the batch. - The same `minibatch_size` and `num_iters` will be used for all module ids in + The same `minibatch_size` and `num_epochs` will be used for all module ids in MultiRLModule. Args: @@ -996,17 +991,20 @@ def update_from_episodes( timesteps: Timesteps dict, which must have the key `NUM_ENV_STEPS_SAMPLED_LIFETIME`. # TODO (sven): Make this a more formal structure with its own type. - minibatch_size: The size of the minibatch to use for each update. - num_iters: The number of complete passes over all the sub-batches - in the input multi-agent batch. - num_total_mini_batches: The total number of mini-batches to loop through - (across all `num_sgd_iter` SGD iterations). It's required to set this - for multi-agent + multi-GPU situations in which the MultiAgentEpisodes + num_epochs: The number of complete passes over the entire train batch. Each + pass might be further split into n minibatches (if `minibatch_size` + provided). The train batch is generated from the given `episodes` + through the Learner connector pipeline. + minibatch_size: The size of minibatches to use to further split the train + batch into. The train batch is generated from the given `episodes` + through the Learner connector pipeline. 
+ num_total_minibatches: The total number of minibatches to loop through + (over all `num_epochs` epochs). It's only required to set this to != 0 + in multi-agent + multi-GPU situations, in which the MultiAgentEpisodes themselves are roughly sharded equally, however, they might contain SingleAgentEpisodes with very lopsided length distributions. Thus, - without this fixed, pre-computed value it can happen that one Learner - goes through a different number of mini-batches than other Learners, - causing a deadlock. + without this fixed, pre-computed value, one Learner might go through a + different number of minibatche passes than others causing a deadlock. Returns: A `ResultDict` object produced by a call to `self.metrics.reduce()`. The @@ -1015,22 +1013,18 @@ def update_from_episodes( Learner) to further reduce these results (for example over n parallel Learners). """ - if reduce_fn != DEPRECATED_VALUE: + if num_iters != DEPRECATED_VALUE: deprecation_warning( - old="Learner.update_from_episodes(reduce_fn=..)", - new="Learner.metrics.[log_value|log_dict|log_time](key=..., value=..., " - "reduce=[mean|min|max|sum], window=..., ema_coeff=...)", - help="Use the new ray.rllib.utils.metrics.metrics_logger::MetricsLogger" - " API in your custom Learner methods for logging your custom values " - "and time-reducing (or parallel-reducing) them.", + old="Learner.update_from_episodes(num_iters=...)", + new="Learner.update_from_episodes(num_epochs=...)", error=True, ) return self._update_from_batch_or_episodes( episodes=episodes, timesteps=timesteps, minibatch_size=minibatch_size, - num_iters=num_iters, - num_total_mini_batches=num_total_mini_batches, + num_epochs=num_epochs, + num_total_minibatches=num_total_minibatches, ) def update_from_iterator( @@ -1043,7 +1037,7 @@ def update_from_iterator( **kwargs, ): self._check_is_built() - minibatch_size = minibatch_size or 32 + #minibatch_size = minibatch_size or 32 # Call `before_gradient_based_update` to allow for non-gradient based # preparations-, logging-, and update logic to happen. @@ -1228,8 +1222,9 @@ def _update_from_batch_or_episodes( # TODO (sven): Deprecate these in favor of config attributes for only those # algos that actually need (and know how) to do minibatching. minibatch_size: Optional[int] = None, - num_iters: int = 1, - num_total_mini_batches: int = 0, + num_epochs: int = 1, + shuffle_batch_per_epoch: bool = True, + num_total_minibatches: int = 0, ) -> Union[Dict[str, Any], List[Dict[str, Any]]]: self._check_is_built() @@ -1296,17 +1291,12 @@ def _update_from_batch_or_episodes( if minibatch_size: if self._learner_connector is not None: - batch_iter = partial( - MiniBatchCyclicIterator, - uses_new_env_runners=True, - num_total_mini_batches=num_total_mini_batches, - shuffle=self.config.shuffle_single_agent_batch, - ) + batch_iter = partial(MiniBatchCyclicIterator, _uses_new_env_runners=True) else: batch_iter = MiniBatchCyclicIterator - elif num_iters > 1: - # `minibatch_size` was not set but `num_iters` > 1. - # Under the old training stack, users could do multiple sgd passes + elif num_epochs > 1: + # `minibatch_size` was not set but `num_epochs` > 1. + # Under the old training stack, users could do multiple epochs # over a batch without specifying a minibatch size. We enable # this behavior here by setting the minibatch size to be the size # of the batch (e.g. 1 minibatch of size batch.count) @@ -1314,7 +1304,7 @@ def _update_from_batch_or_episodes( # Note that there is no need to shuffle here, b/c we don't have minibatches. 
batch_iter = MiniBatchCyclicIterator else: - # `minibatch_size` and `num_iters` are not set by the user. + # `minibatch_size` and `num_epochs` are not set by the user. batch_iter = MiniBatchDummyIterator # Convert input batch into a tensor batch (MultiAgentBatch) on the correct @@ -1324,7 +1314,13 @@ def _update_from_batch_or_episodes( batch = self._convert_batch_type(batch) batch = self._set_slicing_by_batch_id(batch, value=True) - for tensor_minibatch in batch_iter(batch, minibatch_size, num_iters): + for tensor_minibatch in batch_iter( + batch, + minibatch_size=minibatch_size, + num_epochs=num_epochs, + shuffle_batch_per_epoch=shuffle_batch_per_epoch and (num_epochs > 1), + num_total_minibatches=num_total_minibatches, + ): # Make the actual in-graph/traced `_update` call. This should return # all tensor values (no numpy). fwd_out, loss_per_module, tensor_metrics = self._update( diff --git a/rllib/core/learner/learner_group.py b/rllib/core/learner/learner_group.py index 273af2352031..d746265c9b23 100644 --- a/rllib/core/learner/learner_group.py +++ b/rllib/core/learner/learner_group.py @@ -221,13 +221,9 @@ def update_from_batch( timesteps: Optional[Dict[str, Any]] = None, async_update: bool = False, return_state: bool = False, - # TODO (sven): Deprecate the following args. They should be extracted from the - # self.config of those specific algorithms that actually require these - # settings. + num_epochs: int = 1, + shuffle_batch_per_epoch: bool = False, minibatch_size: Optional[int] = None, - num_iters: int = 1, - # Already deprecated args. - reduce_fn=DEPRECATED_VALUE, # User kwargs. **kwargs, ) -> Union[Dict[str, Any], List[Dict[str, Any]], List[List[Dict[str, Any]]]]: @@ -261,24 +257,13 @@ def update_from_batch( results are reduced, a list of dictionaries of the reduced results from each call to async_update that is ready. """ - if reduce_fn != DEPRECATED_VALUE: - deprecation_warning( - old="LearnerGroup.update_from_batch(reduce_fn=..)", - new="Learner.metrics.[log_value|log_dict|log_time](key=..., value=..., " - "reduce=[mean|min|max|sum], window=..., ema_coeff=...)", - help="Use the new ray.rllib.utils.metrics.metrics_logger::MetricsLogger" - " API in your custom Learner methods for logging and time-reducing any " - "custom metrics. The central `MetricsLogger` instance is available " - "under `self.metrics` within your custom Learner.", - error=True, - ) return self._update( batch=batch, timesteps=timesteps, async_update=async_update, return_state=return_state, minibatch_size=minibatch_size, - num_iters=num_iters, + num_epochs=num_epochs, **kwargs, ) @@ -289,13 +274,9 @@ def update_from_episodes( timesteps: Optional[Dict[str, Any]] = None, async_update: bool = False, return_state: bool = False, - # TODO (sven): Deprecate the following args. They should be extracted from the - # self.config of those specific algorithms that actually require these - # settings. + num_epochs: int = 1, + shuffle_batch_per_epoch: bool = False, minibatch_size: Optional[int] = None, - num_iters: int = 1, - # Already deprecated args. - reduce_fn=DEPRECATED_VALUE, # User kwargs. **kwargs, ) -> Union[Dict[str, Any], List[Dict[str, Any]], List[List[Dict[str, Any]]]]: @@ -329,25 +310,13 @@ def update_from_episodes( results are reduced, a list of dictionaries of the reduced results from each call to async_update that is ready. 
""" - if reduce_fn != DEPRECATED_VALUE: - deprecation_warning( - old="LearnerGroup.update_from_episodes(reduce_fn=..)", - new="Learner.metrics.[log_value|log_dict|log_time](key=..., value=..., " - "reduce=[mean|min|max|sum], window=..., ema_coeff=...)", - help="Use the new ray.rllib.utils.metrics.metrics_logger::MetricsLogger" - " API in your custom Learner methods for logging and time-reducing any " - "custom metrics. The central `MetricsLogger` instance is available " - "under `self.metrics` within your custom Learner.", - error=True, - ) - return self._update( episodes=episodes, timesteps=timesteps, async_update=async_update, return_state=return_state, minibatch_size=minibatch_size, - num_iters=num_iters, + num_epochs=num_epochs, **kwargs, ) @@ -359,11 +328,17 @@ def _update( timesteps: Optional[Dict[str, Any]] = None, async_update: bool = False, return_state: bool = False, + num_epochs: int = 1, minibatch_size: Optional[int] = None, - num_iters: int = 1, + shuffle_batch_per_epoch: bool = False, + # Deprecated args. + num_iters=DEPRECATED_VALUE, **kwargs, ) -> Union[Dict[str, Any], List[Dict[str, Any]], List[List[Dict[str, Any]]]]: + if num_iters != DEPRECATED_VALUE: + deprecation_warning(old="num_iters", new="num_epochs", error=True) + # Define function to be called on all Learner actors (or the local learner). def _learner_update( _learner: Learner, @@ -372,7 +347,7 @@ def _learner_update( _episodes_shard=None, _timesteps=None, _return_state=False, - _num_total_mini_batches=0, + _num_total_minibatches=0, **_kwargs, ): # If the batch shard is an `DataIterator` we have an offline @@ -383,7 +358,7 @@ def _learner_update( iterator=_batch_shard, timesteps=_timesteps, minibatch_size=minibatch_size, - num_iters=num_iters, + num_epochs=num_epochs, **_kwargs, ) elif _batch_shard is not None: @@ -391,7 +366,7 @@ def _learner_update( batch=_batch_shard, timesteps=_timesteps, minibatch_size=minibatch_size, - num_iters=num_iters, + num_epochs=num_epochs, **_kwargs, ) else: @@ -399,8 +374,8 @@ def _learner_update( episodes=_episodes_shard, timesteps=_timesteps, minibatch_size=minibatch_size, - num_iters=num_iters, - num_total_mini_batches=_num_total_mini_batches, + num_epochs=num_epochs, + num_total_minibatches=_num_total_minibatches, **_kwargs, ) if _return_state: @@ -485,13 +460,13 @@ def _learner_update( from ray.data.iterator import DataIterator if isinstance(episodes[0], DataIterator): - num_total_mini_batches = 0 + num_total_minibatches = 0 partials = [ partial( _learner_update, _episodes_shard=episodes_shard, _timesteps=timesteps, - _num_total_mini_batches=num_total_mini_batches, + _num_total_minibatches=num_total_minibatches, ) for episodes_shard in episodes ] @@ -506,20 +481,20 @@ def _learner_update( # In the multi-agent case AND `minibatch_size` AND num_workers # > 1, we compute a max iteration counter such that the different # Learners will not go through a different number of iterations. 
- num_total_mini_batches = 0 + num_total_minibatches = 0 if minibatch_size and len(self._workers) > 1: - num_total_mini_batches = self._compute_num_total_mini_batches( + num_total_minibatches = self._compute_num_total_minibatches( episodes, len(self._workers), minibatch_size, - num_iters, + num_epochs, ) partials = [ partial( _learner_update, _episodes_shard=eps_shard, _timesteps=timesteps, - _num_total_mini_batches=num_total_mini_batches, + _num_total_minibatches=num_total_minibatches, ) for eps_shard in eps_shards ] @@ -934,11 +909,11 @@ def __del__(self): self.shutdown() @staticmethod - def _compute_num_total_mini_batches( + def _compute_num_total_minibatches( episodes, num_shards, - mini_batch_size, - num_iters, + minibatch_size, + num_epochs, ): # Count total number of timesteps per module ID. if isinstance(episodes[0], MultiAgentEpisode): @@ -950,7 +925,7 @@ def _compute_num_total_mini_batches( else: max_ts = sum(map(len, episodes)) - return int((num_iters * max_ts) / (num_shards * mini_batch_size)) + return int((num_epochs * max_ts) / (num_shards * minibatch_size)) @Deprecated(new="LearnerGroup.update_from_batch(async=False)", error=False) def update(self, *args, **kwargs): diff --git a/rllib/env/tests/test_multi_agent_env.py b/rllib/env/tests/test_multi_agent_env.py index 2646c24c41ac..707effbaa4ce 100644 --- a/rllib/env/tests/test_multi_agent_env.py +++ b/rllib/env/tests/test_multi_agent_env.py @@ -598,7 +598,7 @@ def test_multi_agent_with_flex_agents(self): .environment("flex_agents_multi_agent") .env_runners(num_env_runners=0) .framework("tf") - .training(train_batch_size=50, sgd_minibatch_size=50, num_sgd_iter=1) + .training(train_batch_size=50, minibatch_size=50, num_epochs=1) ) algo = config.build() for i in range(10): @@ -863,7 +863,7 @@ def gen_policy(): ), ) .framework("tf") - .training(train_batch_size=50, sgd_minibatch_size=50, num_sgd_iter=1) + .training(train_batch_size=50, minibatch_size=50, num_epochs=1) ) algo = config.build() diff --git a/rllib/evaluation/tests/test_envs_that_crash.py b/rllib/evaluation/tests/test_envs_that_crash.py index 573925b35d6b..cef94ecbd7dd 100644 --- a/rllib/evaluation/tests/test_envs_that_crash.py +++ b/rllib/evaluation/tests/test_envs_that_crash.py @@ -109,7 +109,7 @@ def test_env_crash_on_one_worker_during_sampling_but_recreate_worker(self): recreate_failed_env_runners=True, delay_between_env_runner_restarts_s=0, ) - .training(train_batch_size=60, sgd_minibatch_size=60) + .training(train_batch_size=60, minibatch_size=60) .environment( env=CartPoleCrashing, env_config={ diff --git a/rllib/evaluation/tests/test_rollout_worker.py b/rllib/evaluation/tests/test_rollout_worker.py index 4f8ed097170c..145f4695f849 100644 --- a/rllib/evaluation/tests/test_rollout_worker.py +++ b/rllib/evaluation/tests/test_rollout_worker.py @@ -208,7 +208,7 @@ def test_query_evaluators(self): num_envs_per_env_runner=2, create_env_on_local_worker=True, ) - .training(train_batch_size=20, sgd_minibatch_size=5, num_sgd_iter=1) + .training(train_batch_size=20, minibatch_size=5, num_epochs=1) ) algo = config.build() results = algo.env_runner_group.foreach_worker( diff --git a/rllib/evaluation/tests/test_trajectory_view_api.py b/rllib/evaluation/tests/test_trajectory_view_api.py index 457abba37f63..dab76f73cf56 100644 --- a/rllib/evaluation/tests/test_trajectory_view_api.py +++ b/rllib/evaluation/tests/test_trajectory_view_api.py @@ -290,7 +290,7 @@ def test_counting_by_agent_steps(self): # Env setup. 
.environment(MultiAgentPendulum, env_config={"num_agents": num_agents}) .env_runners(num_env_runners=2, rollout_fragment_length=21) - .training(num_sgd_iter=2, train_batch_size=168) + .training(num_epochs=2, train_batch_size=168) .framework("torch") .multi_agent( policies={f"p{i}" for i in range(num_agents)}, diff --git a/rllib/examples/actions/nested_action_spaces.py b/rllib/examples/actions/nested_action_spaces.py index db7ad434c674..bb8c3dbf4e71 100644 --- a/rllib/examples/actions/nested_action_spaces.py +++ b/rllib/examples/actions/nested_action_spaces.py @@ -84,7 +84,7 @@ def _env_to_module_pipeline(env): base_config.training( # We don't want high entropy in this Env. entropy_coeff=0.00005, - num_sgd_iter=4, + num_epochs=4, vf_loss_coeff=0.01, ) diff --git a/rllib/examples/algorithms/custom_training_step_on_and_off_policy_combined.py b/rllib/examples/algorithms/custom_training_step_on_and_off_policy_combined.py index 95dc0ae26c24..2f31c3e95297 100644 --- a/rllib/examples/algorithms/custom_training_step_on_and_off_policy_combined.py +++ b/rllib/examples/algorithms/custom_training_step_on_and_off_policy_combined.py @@ -178,7 +178,7 @@ def training_step(self) -> ResultDict: None, # Provide entire AlgorithmConfig object, not just an override. PPOConfig() - .training(num_sgd_iter=10, sgd_minibatch_size=128) + .training(num_epochs=10, minibatch_size=128) .framework("torch" if args.torch or args.mixed_torch_tf else "tf"), ), "dqn_policy": ( diff --git a/rllib/examples/cartpole_lstm.py b/rllib/examples/cartpole_lstm.py index a154a73f088a..c7454161ab06 100644 --- a/rllib/examples/cartpole_lstm.py +++ b/rllib/examples/cartpole_lstm.py @@ -67,7 +67,7 @@ ) if args.run == "PPO": - config.training(num_sgd_iter=5, vf_loss_coeff=0.0001, train_batch_size=512) + config.training(num_epochs=5, vf_loss_coeff=0.0001, train_batch_size=512) config.model["vf_share_layers"] = True elif args.run == "IMPALA": config.env_runners(num_env_runners=2) diff --git a/rllib/examples/catalogs/mobilenet_v2_encoder.py b/rllib/examples/catalogs/mobilenet_v2_encoder.py index 119d9f6442ef..93d85bcd7633 100644 --- a/rllib/examples/catalogs/mobilenet_v2_encoder.py +++ b/rllib/examples/catalogs/mobilenet_v2_encoder.py @@ -53,7 +53,7 @@ def _get_encoder_config( # The following training settings make it so that a training iteration is very # quick. This is just for the sake of this example. PPO will not learn properly # with these settings! 
- .training(train_batch_size=32, sgd_minibatch_size=16, num_sgd_iter=1) + .training(train_batch_size=32, minibatch_size=16, num_epochs=1) ) # CartPole's observation space is not compatible with our MobileNetV2 Encoder, so diff --git a/rllib/examples/checkpoints/restore_1_of_n_agents_from_checkpoint.py b/rllib/examples/checkpoints/restore_1_of_n_agents_from_checkpoint.py index e38b46309dc1..9da050fe0b62 100644 --- a/rllib/examples/checkpoints/restore_1_of_n_agents_from_checkpoint.py +++ b/rllib/examples/checkpoints/restore_1_of_n_agents_from_checkpoint.py @@ -96,7 +96,7 @@ .environment("env") .training( train_batch_size_per_learner=512, - mini_batch_size_per_learner=64, + minibatch_size=64, lambda_=0.1, gamma=0.95, lr=0.0003, diff --git a/rllib/examples/connectors/flatten_observations_dict_space.py b/rllib/examples/connectors/flatten_observations_dict_space.py index 6958c9c27cd2..a6af2f16c19e 100644 --- a/rllib/examples/connectors/flatten_observations_dict_space.py +++ b/rllib/examples/connectors/flatten_observations_dict_space.py @@ -142,7 +142,7 @@ def _env_to_module_pipeline(env): # PPO-specific settings (for better learning behavior only). if args.algo == "PPO": base_config.training( - num_sgd_iter=6, + num_epochs=6, vf_loss_coeff=0.01, ) # IMPALA-specific settings (for better learning behavior only). diff --git a/rllib/examples/connectors/frame_stacking.py b/rllib/examples/connectors/frame_stacking.py index 52a4f4c352b1..f0dcea0b9c43 100644 --- a/rllib/examples/connectors/frame_stacking.py +++ b/rllib/examples/connectors/frame_stacking.py @@ -212,8 +212,8 @@ def _env_creator(cfg): # PPO specific settings. if args.algo == "PPO": base_config.training( - num_sgd_iter=10, - mini_batch_size_per_learner=64, + num_epochs=10, + minibatch_size=64, lambda_=0.95, kl_coeff=0.5, clip_param=0.1, diff --git a/rllib/examples/connectors/mean_std_filtering.py b/rllib/examples/connectors/mean_std_filtering.py index 2fec8f3c63d0..75c373333677 100644 --- a/rllib/examples/connectors/mean_std_filtering.py +++ b/rllib/examples/connectors/mean_std_filtering.py @@ -183,7 +183,7 @@ def observation(self, observation): # PPO specific settings. 
if args.algo == "PPO": base_config.training( - mini_batch_size_per_learner=64, + minibatch_size=64, lambda_=0.1, vf_clip_param=10.0, ) diff --git a/rllib/examples/connectors/prev_actions_prev_rewards.py b/rllib/examples/connectors/prev_actions_prev_rewards.py index dcee6ac5689e..a7b0bc056218 100644 --- a/rllib/examples/connectors/prev_actions_prev_rewards.py +++ b/rllib/examples/connectors/prev_actions_prev_rewards.py @@ -141,7 +141,7 @@ def _env_to_module(env): .environment("env") .env_runners(env_to_module_connector=_env_to_module) .training( - num_sgd_iter=6, + num_epochs=6, lr=0.0003, train_batch_size=4000, vf_loss_coeff=0.01, diff --git a/rllib/examples/curiosity/count_based_curiosity.py b/rllib/examples/curiosity/count_based_curiosity.py index 90f69a513ac9..7b9b4b83d500 100644 --- a/rllib/examples/curiosity/count_based_curiosity.py +++ b/rllib/examples/curiosity/count_based_curiosity.py @@ -127,7 +127,7 @@ learner_connector=( None if args.no_curiosity else lambda *ags, **kw: CountBasedCuriosity() ), - num_sgd_iter=10, + num_epochs=10, vf_loss_coeff=0.01, ) .rl_module(model_config_dict={"vf_share_layers": True}) diff --git a/rllib/examples/curiosity/intrinsic_curiosity_model_based_curiosity.py b/rllib/examples/curiosity/intrinsic_curiosity_model_based_curiosity.py index 9aab5a31a4ad..5809c3c9a420 100644 --- a/rllib/examples/curiosity/intrinsic_curiosity_model_based_curiosity.py +++ b/rllib/examples/curiosity/intrinsic_curiosity_model_based_curiosity.py @@ -270,7 +270,7 @@ def on_sample_end( # Set PPO-specific hyper-parameters. if args.algo == "PPO": base_config.training( - num_sgd_iter=6, + num_epochs=6, # Plug in the correct Learner class. learner_class=PPOTorchLearnerWithCuriosity, train_batch_size_per_learner=2000, diff --git a/rllib/examples/curriculum/curriculum_learning.py b/rllib/examples/curriculum/curriculum_learning.py index a6f0e9fb2d26..7a7cd6cc1d41 100644 --- a/rllib/examples/curriculum/curriculum_learning.py +++ b/rllib/examples/curriculum/curriculum_learning.py @@ -218,7 +218,7 @@ def on_train_result( env_to_module_connector=lambda env: FlattenObservations(), ) .training( - num_sgd_iter=6, + num_epochs=6, vf_loss_coeff=0.01, lr=0.0002, ) diff --git a/rllib/examples/debugging/deterministic_training.py b/rllib/examples/debugging/deterministic_training.py index 6bbf538e025c..5ef6ee1a0167 100644 --- a/rllib/examples/debugging/deterministic_training.py +++ b/rllib/examples/debugging/deterministic_training.py @@ -65,7 +65,7 @@ if args.run == "PPO": # Simplify to run this example script faster. - config.training(sgd_minibatch_size=10, num_sgd_iter=5) + config.training(minibatch_size=10, num_epochs=5) stop = {TRAINING_ITERATION: args.stop_iters} diff --git a/rllib/examples/envs/env_rendering_and_recording.py b/rllib/examples/envs/env_rendering_and_recording.py index d910ac92fc57..903056288f4e 100644 --- a/rllib/examples/envs/env_rendering_and_recording.py +++ b/rllib/examples/envs/env_rendering_and_recording.py @@ -264,7 +264,7 @@ def _env_creator(cfg): clip_param=0.1, vf_clip_param=10.0, entropy_coeff=0.01, - num_sgd_iter=10, + num_epochs=10, # Linearly adjust learning rate based on number of GPUs. 
lr=0.00015 * (args.num_gpus or 1), grad_clip=100.0, diff --git a/rllib/examples/envs/greyscale_env.py b/rllib/examples/envs/greyscale_env.py index 5af971ad23fb..2f0e5ffc9560 100644 --- a/rllib/examples/envs/greyscale_env.py +++ b/rllib/examples/envs/greyscale_env.py @@ -101,11 +101,11 @@ def env_creator(config): vf_loss_coeff=0.1, clip_param=0.1, vf_clip_param=10.0, - num_sgd_iter=10, + num_epochs=10, kl_coeff=0.5, lr=0.0001, grad_clip=100, - sgd_minibatch_size=500, + minibatch_size=500, train_batch_size=5000 if not args.as_test else 1000, model={"vf_share_layers": True}, ) diff --git a/rllib/examples/envs/unity3d_env_local.py b/rllib/examples/envs/unity3d_env_local.py index 40350a8c5853..d334125ee4e8 100644 --- a/rllib/examples/envs/unity3d_env_local.py +++ b/rllib/examples/envs/unity3d_env_local.py @@ -145,9 +145,9 @@ lr=0.0003, lambda_=0.95, gamma=0.99, - sgd_minibatch_size=256, + minibatch_size=256, train_batch_size=4000, - num_sgd_iter=20, + num_epochs=20, clip_param=0.2, model={"fcnet_hiddens": [512, 512]}, ) diff --git a/rllib/examples/gpus/float16_training_and_inference.py b/rllib/examples/gpus/float16_training_and_inference.py index 169481b849bb..aa498663b2d6 100644 --- a/rllib/examples/gpus/float16_training_and_inference.py +++ b/rllib/examples/gpus/float16_training_and_inference.py @@ -249,7 +249,7 @@ def configure_optimizers_for_module(self, module_id, config): # Typical CartPole-v1 hyperparams known to work well: gamma=0.99, lr=0.0003, - num_sgd_iter=6, + num_epochs=6, vf_loss_coeff=0.01, use_kl_loss=True, ) diff --git a/rllib/examples/gpus/mixed_precision_training_float16_inference.py b/rllib/examples/gpus/mixed_precision_training_float16_inference.py index 56d4fb171208..e27dd8b7b579 100644 --- a/rllib/examples/gpus/mixed_precision_training_float16_inference.py +++ b/rllib/examples/gpus/mixed_precision_training_float16_inference.py @@ -169,7 +169,7 @@ def _update(self, *args, **kwargs): # Typical CartPole-v1 hyperparams known to work well: gamma=0.99, lr=0.0003, - num_sgd_iter=6, + num_epochs=6, vf_loss_coeff=0.01, use_kl_loss=True, ) diff --git a/rllib/examples/inference/policy_inference_after_training_w_connector.py b/rllib/examples/inference/policy_inference_after_training_w_connector.py index 0e092680b390..8391ca272704 100644 --- a/rllib/examples/inference/policy_inference_after_training_w_connector.py +++ b/rllib/examples/inference/policy_inference_after_training_w_connector.py @@ -151,7 +151,7 @@ def _env_creator(cfg): get_trainable_cls(args.algo) .get_default_config() .training( - num_sgd_iter=6, + num_epochs=6, lr=0.0003, vf_loss_coeff=0.01, ) diff --git a/rllib/examples/learners/custom_loss_fn_simple.py b/rllib/examples/learners/custom_loss_fn_simple.py index 2cf94790c94a..151406330502 100644 --- a/rllib/examples/learners/custom_loss_fn_simple.py +++ b/rllib/examples/learners/custom_loss_fn_simple.py @@ -128,7 +128,7 @@ class for details on how to override the main (PPO) loss function. # `self.config.learner_config_dict['regularizer_coeff']` learner_config_dict={"regularizer_coeff": args.regularizer_coeff}, # Some settings to make this example learn better. - num_sgd_iter=6, + num_epochs=6, vf_loss_coeff=0.01, # The learning rate, settable through the command line `--lr` arg. 
lr=args.lr, diff --git a/rllib/examples/learners/separate_vf_lr_and_optimizer.py b/rllib/examples/learners/separate_vf_lr_and_optimizer.py index b8d21db87f13..93f03e6101c5 100644 --- a/rllib/examples/learners/separate_vf_lr_and_optimizer.py +++ b/rllib/examples/learners/separate_vf_lr_and_optimizer.py @@ -117,7 +117,7 @@ class for details on how to override the main (torch) `configure_optimizers_for_ # `self.config.learner_config_dict['lr_vf']` learner_config_dict={"lr_vf": args.lr_vf}, # Some settings to make this example learn better. - num_sgd_iter=6, + num_epochs=6, # Since we are using separate optimizers for the two NN components, the # value of `vf_loss_coeff` does not matter anymore. We set this to 1.0 here. vf_loss_coeff=1.0, diff --git a/rllib/examples/learners/train_w_bc_finetune_w_ppo.py b/rllib/examples/learners/train_w_bc_finetune_w_ppo.py index 2a5a2baae730..d12ccd3eedbf 100644 --- a/rllib/examples/learners/train_w_bc_finetune_w_ppo.py +++ b/rllib/examples/learners/train_w_bc_finetune_w_ppo.py @@ -125,7 +125,7 @@ def train_ppo_agent_from_checkpointed_module( .training( lr=0.0001, gamma=0.99, - num_sgd_iter=6, + num_epochs=6, vf_loss_coeff=0.01, ) ) diff --git a/rllib/examples/multi_agent/multi_agent_pendulum.py b/rllib/examples/multi_agent/multi_agent_pendulum.py index 80aa2441692e..74ed6045673e 100644 --- a/rllib/examples/multi_agent/multi_agent_pendulum.py +++ b/rllib/examples/multi_agent/multi_agent_pendulum.py @@ -49,7 +49,7 @@ .environment("env" if args.num_agents > 0 else "Pendulum-v1") .training( train_batch_size_per_learner=512, - mini_batch_size_per_learner=64, + minibatch_size=64, lambda_=0.1, gamma=0.95, lr=0.0003, diff --git a/rllib/examples/multi_agent/self_play_league_based_with_open_spiel.py b/rllib/examples/multi_agent/self_play_league_based_with_open_spiel.py index c4fe7e30e814..d1670c3be9c9 100644 --- a/rllib/examples/multi_agent/self_play_league_based_with_open_spiel.py +++ b/rllib/examples/multi_agent/self_play_league_based_with_open_spiel.py @@ -185,7 +185,7 @@ def _get_multi_agent(): num_cpus_for_main_process=1, ) .training( - num_sgd_iter=20, + num_epochs=20, model=dict( **({"uses_new_env_runners": True} if args.enable_new_api_stack else {}), ), diff --git a/rllib/examples/multi_agent/self_play_with_open_spiel.py b/rllib/examples/multi_agent/self_play_with_open_spiel.py index 3c01d25a244c..7420e2604790 100644 --- a/rllib/examples/multi_agent/self_play_with_open_spiel.py +++ b/rllib/examples/multi_agent/self_play_with_open_spiel.py @@ -173,9 +173,9 @@ def policy_mapping_fn(agent_id, episode, worker, **kwargs): ) ) - # Only for PPO, change the `num_sgd_iter` setting. + # Only for PPO, change the `num_epochs` setting. if args.algo == "PPO": - config.training(num_sgd_iter=20) + config.training(num_epochs=20) stop = { NUM_ENV_STEPS_SAMPLED_LIFETIME: args.stop_timesteps, diff --git a/rllib/examples/multi_agent/two_algorithms.py b/rllib/examples/multi_agent/two_algorithms.py index f77c6d0d5c3b..21169110cf7d 100644 --- a/rllib/examples/multi_agent/two_algorithms.py +++ b/rllib/examples/multi_agent/two_algorithms.py @@ -95,7 +95,7 @@ def select_policy(algorithm, framework): .training( model={"vf_share_layers": True}, vf_loss_coeff=0.01, - num_sgd_iter=6, + num_epochs=6, ) # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0. 
.resources(num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0"))) diff --git a/rllib/examples/quadx_waypoints.py b/rllib/examples/quadx_waypoints.py index 4a93fcdbef6c..4afeced1e25d 100644 --- a/rllib/examples/quadx_waypoints.py +++ b/rllib/examples/quadx_waypoints.py @@ -104,7 +104,7 @@ def create_quadx_waypoints_env(env_config): } ) config.training( - sgd_minibatch_size=128, + minibatch_size=128, train_batch_size_per_learner=10000, ) # If IMPALA set additional arguments. diff --git a/rllib/examples/rl_modules/classes/lstm_containing_rlm.py b/rllib/examples/rl_modules/classes/lstm_containing_rlm.py index 993df559301b..c2dd1c230d2d 100644 --- a/rllib/examples/rl_modules/classes/lstm_containing_rlm.py +++ b/rllib/examples/rl_modules/classes/lstm_containing_rlm.py @@ -13,7 +13,7 @@ torch, nn = try_import_torch() -class LSTMContainingRLModule(TorchRLModule): +class LSTMContainingRLModule(TorchRLModule, ValueFunctionAPI): """An example TorchRLModule that contains an LSTM layer. .. testcode:: diff --git a/rllib/examples/rl_modules/classes/mobilenet_rlm.py b/rllib/examples/rl_modules/classes/mobilenet_rlm.py index 49878ec555f9..7cd87d5b8922 100644 --- a/rllib/examples/rl_modules/classes/mobilenet_rlm.py +++ b/rllib/examples/rl_modules/classes/mobilenet_rlm.py @@ -75,7 +75,7 @@ def setup(self): # The following training settings make it so that a training iteration is very # quick. This is just for the sake of this example. PPO will not learn properly # with these settings! - .training(train_batch_size=32, sgd_minibatch_size=16, num_sgd_iter=1) + .training(train_batch_size=32, minibatch_size=16, num_epochs=1) ) config.build().train() diff --git a/rllib/examples/rl_modules/classes/tiny_atari_cnn_rlm.py b/rllib/examples/rl_modules/classes/tiny_atari_cnn_rlm.py index 6089583a31b3..88ea754b7217 100644 --- a/rllib/examples/rl_modules/classes/tiny_atari_cnn_rlm.py +++ b/rllib/examples/rl_modules/classes/tiny_atari_cnn_rlm.py @@ -17,7 +17,7 @@ torch, nn = try_import_torch() -class TinyAtariCNN(TorchRLModule): +class TinyAtariCNN(TorchRLModule, ValueFunctionAPI): """A tiny CNN stack for fast-learning of Atari envs. The architecture here is the exact same as the one used by the old API stack as diff --git a/rllib/examples/rl_modules/custom_lstm_rl_module.py b/rllib/examples/rl_modules/custom_lstm_rl_module.py index 3d3cf285eb19..14285d16e5b6 100644 --- a/rllib/examples/rl_modules/custom_lstm_rl_module.py +++ b/rllib/examples/rl_modules/custom_lstm_rl_module.py @@ -80,7 +80,7 @@ ) .training( train_batch_size_per_learner=1024, - num_sgd_iter=6, + num_epochs=6, lr=0.0009, vf_loss_coeff=0.001, entropy_coeff=0.0, diff --git a/rllib/execution/train_ops.py b/rllib/execution/train_ops.py index b78453d3e9d0..bf930a00f5e2 100644 --- a/rllib/execution/train_ops.py +++ b/rllib/execution/train_ops.py @@ -45,14 +45,14 @@ def train_one_step(algorithm, train_batch, policies_to_train=None) -> Dict: config = algorithm.config workers = algorithm.env_runner_group local_worker = workers.local_env_runner - num_sgd_iter = config.get("num_sgd_iter", 1) - sgd_minibatch_size = config.get("sgd_minibatch_size", 0) + num_sgd_iter = config.get("num_epochs", config.get("num_sgd_iter", 1)) + minibatch_size = config.get("minibatch_size", config.get("sgd_minibatch_size", 0)) learn_timer = algorithm._timers[LEARN_ON_BATCH_TIMER] with learn_timer: - # Subsample minibatches (size=`sgd_minibatch_size`) from the + # Subsample minibatches (size=`minibatch_size`) from the # train batch and loop through train batch `num_sgd_iter` times. 
- if num_sgd_iter > 1 or sgd_minibatch_size > 0: + if num_sgd_iter > 1 or minibatch_size > 0: info = do_minibatch_sgd( train_batch, { @@ -62,7 +62,7 @@ def train_one_step(algorithm, train_batch, policies_to_train=None) -> Dict: }, local_worker, num_sgd_iter, - sgd_minibatch_size, + minibatch_size, [], ) # Single update step using train batch. @@ -114,15 +114,15 @@ def multi_gpu_train_one_step(algorithm, train_batch) -> Dict: config = algorithm.config workers = algorithm.env_runner_group local_worker = workers.local_env_runner - num_sgd_iter = config.get("num_sgd_iter", 1) - sgd_minibatch_size = config.get("sgd_minibatch_size", config["train_batch_size"]) + num_sgd_iter = config.get("num_epochs", config.get("num_sgd_iter", 1)) + minibatch_size = config.get("minibatch_size", config["train_batch_size"]) # Determine the number of devices (GPUs or 1 CPU) we use. num_devices = int(math.ceil(config["num_gpus"] or 1)) # Make sure total batch size is dividable by the number of devices. # Batch size per tower. - per_device_batch_size = sgd_minibatch_size // num_devices + per_device_batch_size = minibatch_size // num_devices # Total batch size. batch_size = per_device_batch_size * num_devices assert batch_size % num_devices == 0 diff --git a/rllib/models/tests/test_attention_nets.py b/rllib/models/tests/test_attention_nets.py index 1ccc216aec3c..bed5ad726fbc 100644 --- a/rllib/models/tests/test_attention_nets.py +++ b/rllib/models/tests/test_attention_nets.py @@ -68,9 +68,9 @@ def test_attention_nets_w_prev_actions_and_prev_rewards(self): "attention_use_n_prev_actions": 3, "attention_use_n_prev_rewards": 2, }, - "num_sgd_iter": 1, + "num_epochs": 1, "train_batch_size": 200, - "sgd_minibatch_size": 50, + "minibatch_size": 50, "rollout_fragment_length": 100, "num_env_runners": 1, } @@ -88,7 +88,7 @@ def test_ppo_attention_net_learning(self): "num_env_runners": 0, "entropy_coeff": 0.001, "vf_loss_coeff": 1e-5, - "num_sgd_iter": 5, + "num_epochs": 5, "model": { "custom_model": "attention_net", "max_seq_len": 10, diff --git a/rllib/models/tests/test_lstms.py b/rllib/models/tests/test_lstms.py index c8d204b395e5..b49e0db2628f 100644 --- a/rllib/models/tests/test_lstms.py +++ b/rllib/models/tests/test_lstms.py @@ -49,9 +49,9 @@ def test_lstm_w_prev_action_and_prev_reward(self): "lstm_use_prev_action": True, "lstm_use_prev_reward": True, }, - num_sgd_iter=1, + num_epochs=1, train_batch_size=200, - sgd_minibatch_size=50, + minibatch_size=50, ) .env_runners( rollout_fragment_length=100, diff --git a/rllib/models/tests/test_preprocessors.py b/rllib/models/tests/test_preprocessors.py index 64b0836caec6..aa5e5f3758d2 100644 --- a/rllib/models/tests/test_preprocessors.py +++ b/rllib/models/tests/test_preprocessors.py @@ -52,8 +52,8 @@ def test_rlms_and_preprocessing(self): .env_runners(num_env_runners=0) .training( train_batch_size=10, - sgd_minibatch_size=1, - num_sgd_iter=1, + minibatch_size=1, + num_epochs=1, ) # Set this to True to enforce no preprocessors being used. .experimental(_disable_preprocessor_api=True) @@ -90,7 +90,7 @@ def test_preprocessing_disabled_modelv2(self): ) # Speed things up a little. .env_runners(rollout_fragment_length=5) - .training(train_batch_size=100, sgd_minibatch_size=10, num_sgd_iter=1) + .training(train_batch_size=100, minibatch_size=10, num_epochs=1) .debugging(seed=42) # Set this to True to enforce no preprocessors being used. 
# Complex observations now arrive directly in the model as diff --git a/rllib/policy/dynamic_tf_policy.py b/rllib/policy/dynamic_tf_policy.py index edda8c818b5f..efd7b4024131 100644 --- a/rllib/policy/dynamic_tf_policy.py +++ b/rllib/policy/dynamic_tf_policy.py @@ -618,7 +618,8 @@ def learn_on_loaded_batch(self, offset: int = 0, buffer_index: int = 0): # Get the correct slice of the already loaded batch to use, # based on offset and batch size. batch_size = self.config.get( - "sgd_minibatch_size", self.config["train_batch_size"] + "minibatch_size", + self.config.get("sgd_minibatch_size", self.config["train_batch_size"]), ) if batch_size >= len(self._loaded_single_cpu_batch): sliced_batch = self._loaded_single_cpu_batch @@ -972,7 +973,7 @@ def __init__( self.max_per_device_batch_size = ( max_per_device_batch_size or policy.config.get( - "sgd_minibatch_size", policy.config.get("train_batch_size", 999999) + "minibatch_size", policy.config.get("train_batch_size", 999999) ) ) // len(self.devices) input_placeholders = tree.flatten(self.policy._loss_input_dict_no_rnn) @@ -1181,7 +1182,7 @@ def load_data(self, sess, inputs, state_inputs, num_grad_updates=None): if sequences_per_minibatch < len(self.devices): raise ValueError( "Must load at least 1 tuple sequence per device. Try " - "increasing `sgd_minibatch_size` or reducing `max_seq_len` " + "increasing `minibatch_size` or reducing `max_seq_len` " "to ensure that at least one sequence fits per device." ) self._loaded_per_device_batch_size = ( diff --git a/rllib/policy/dynamic_tf_policy_v2.py b/rllib/policy/dynamic_tf_policy_v2.py index 1f1d41aa1760..f11cba1ee57d 100644 --- a/rllib/policy/dynamic_tf_policy_v2.py +++ b/rllib/policy/dynamic_tf_policy_v2.py @@ -1004,8 +1004,10 @@ def learn_on_loaded_batch(self, offset: int = 0, buffer_index: int = 0): # Get the correct slice of the already loaded batch to use, # based on offset and batch size. batch_size = self.config.get( - "sgd_minibatch_size", self.config["train_batch_size"] + "minibatch_size", + self.config.get("sgd_minibatch_size", self.config["train_batch_size"]), ) + if batch_size >= len(self._loaded_single_cpu_batch): sliced_batch = self._loaded_single_cpu_batch else: diff --git a/rllib/policy/torch_policy.py b/rllib/policy/torch_policy.py index 6d53b78da360..e4db6d37a5c0 100644 --- a/rllib/policy/torch_policy.py +++ b/rllib/policy/torch_policy.py @@ -549,7 +549,7 @@ def learn_on_loaded_batch(self, offset: int = 0, buffer_index: int = 0): # Get the correct slice of the already loaded batch to use, # based on offset and batch size. device_batch_size = self.config.get( - "sgd_minibatch_size", self.config["train_batch_size"] + "minibatch_size", self.config["train_batch_size"] ) // len(self.devices) # Set Model to train mode. diff --git a/rllib/policy/torch_policy_v2.py b/rllib/policy/torch_policy_v2.py index a86236108ac1..649fc19f88e3 100644 --- a/rllib/policy/torch_policy_v2.py +++ b/rllib/policy/torch_policy_v2.py @@ -839,7 +839,8 @@ def learn_on_loaded_batch(self, offset: int = 0, buffer_index: int = 0): # Get the correct slice of the already loaded batch to use, # based on offset and batch size. device_batch_size = self.config.get( - "sgd_minibatch_size", self.config["train_batch_size"] + "minibatch_size", + self.config.get("sgd_minibatch_size", self.config["train_batch_size"]), ) // len(self.devices) # Set Model to train mode. 
diff --git a/rllib/tests/backward_compat/checkpoints/create_checkpoints.py b/rllib/tests/backward_compat/checkpoints/create_checkpoints.py index 952d299d385f..d66bcd1f87cf 100644 --- a/rllib/tests/backward_compat/checkpoints/create_checkpoints.py +++ b/rllib/tests/backward_compat/checkpoints/create_checkpoints.py @@ -9,7 +9,7 @@ PPOConfig() .environment("FrozenLake-v1") .training( - num_sgd_iter=2, + num_epochs=2, model=dict( fcnet_hiddens=[10], ), diff --git a/rllib/tests/test_io.py b/rllib/tests/test_io.py index 0fe968a2ae61..4207336bc99a 100644 --- a/rllib/tests/test_io.py +++ b/rllib/tests/test_io.py @@ -195,7 +195,7 @@ def test_agent_input_list(self): config = ( PPOConfig() .environment("CartPole-v1") - .training(train_batch_size=98, sgd_minibatch_size=49) + .training(train_batch_size=98, minibatch_size=49) .evaluation(off_policy_estimation_methods={}) ) diff --git a/rllib/tests/test_lstm.py b/rllib/tests/test_lstm.py index d93951be0f67..eda9a0c3e440 100644 --- a/rllib/tests/test_lstm.py +++ b/rllib/tests/test_lstm.py @@ -183,8 +183,8 @@ def test_simple_optimizer_sequencing(self): .env_runners(num_env_runners=0, rollout_fragment_length=10) .training( train_batch_size=10, - sgd_minibatch_size=10, - num_sgd_iter=1, + minibatch_size=10, + num_epochs=1, model={ "custom_model": "rnn", "max_seq_len": 4, @@ -254,8 +254,8 @@ def test_minibatch_sequencing(self): .env_runners(num_env_runners=0, rollout_fragment_length=20) .training( train_batch_size=20, - sgd_minibatch_size=10, - num_sgd_iter=1, + minibatch_size=10, + num_epochs=1, model={ "custom_model": "rnn", "max_seq_len": 4, diff --git a/rllib/tests/test_nested_observation_spaces.py b/rllib/tests/test_nested_observation_spaces.py index b4d236341f71..402cf859b8cf 100644 --- a/rllib/tests/test_nested_observation_spaces.py +++ b/rllib/tests/test_nested_observation_spaces.py @@ -399,8 +399,8 @@ def test_torch_model(self): .env_runners(num_env_runners=0, rollout_fragment_length=5) .training( train_batch_size=5, - sgd_minibatch_size=5, - num_sgd_iter=1, + minibatch_size=5, + num_epochs=1, model={"custom_model": "composite"}, ) ) @@ -441,8 +441,8 @@ def test_torch_repeated(self): .env_runners(num_env_runners=0, rollout_fragment_length=5) .training( train_batch_size=5, - num_sgd_iter=1, - sgd_minibatch_size=5, + num_epochs=1, + minibatch_size=5, model={"custom_model": "r1"}, ) ) diff --git a/rllib/tests/test_supported_multi_agent.py b/rllib/tests/test_supported_multi_agent.py index 469dba2ea790..a6e8c52ae76c 100644 --- a/rllib/tests/test_supported_multi_agent.py +++ b/rllib/tests/test_supported_multi_agent.py @@ -65,7 +65,7 @@ def test_ppo_multiagent(self): ( PPOConfig() .env_runners(num_env_runners=1, rollout_fragment_length=10) - .training(num_sgd_iter=1, train_batch_size=10, sgd_minibatch_size=1) + .training(num_epochs=1, train_batch_size=10, minibatch_size=1) ), ) diff --git a/rllib/tests/test_supported_spaces.py b/rllib/tests/test_supported_spaces.py index 58b8d50a4150..765cea010f3d 100644 --- a/rllib/tests/test_supported_spaces.py +++ b/rllib/tests/test_supported_spaces.py @@ -74,8 +74,8 @@ def test_ppo(self): .env_runners(num_env_runners=2, rollout_fragment_length=50) .training( train_batch_size=100, - num_sgd_iter=1, - sgd_minibatch_size=50, + num_epochs=1, + minibatch_size=50, model={ "fcnet_hiddens": [10], }, @@ -103,8 +103,8 @@ def test_ppo_no_preprocessors_gpu(self): .env_runners(num_env_runners=2, rollout_fragment_length=50) .training( train_batch_size=100, - num_sgd_iter=1, - sgd_minibatch_size=50, + num_epochs=1, + 
minibatch_size=50, model={ "fcnet_hiddens": [10], }, diff --git a/rllib/tuned_examples/appo/cartpole-appo-separate-losses.py b/rllib/tuned_examples/appo/cartpole-appo-separate-losses.py index c0a9d18eed8b..f75c42912134 100644 --- a/rllib/tuned_examples/appo/cartpole-appo-separate-losses.py +++ b/rllib/tuned_examples/appo/cartpole-appo-separate-losses.py @@ -23,7 +23,7 @@ _separate_vf_optimizer=True, # Separate learning rate (and schedule) for the value function branch. _lr_vf=tune.grid_search([0.00075, [[0, 0.00075], [100000, 0.0003]]]), - num_sgd_iter=6, + num_epochs=6, # `vf_loss_coeff` will be ignored anyways as we use separate loss terms. vf_loss_coeff=0.01, vtrace=True, diff --git a/rllib/tuned_examples/appo/cartpole-appo.yaml b/rllib/tuned_examples/appo/cartpole-appo.yaml index 03cd464cf495..bfceaddcf02f 100644 --- a/rllib/tuned_examples/appo/cartpole-appo.yaml +++ b/rllib/tuned_examples/appo/cartpole-appo.yaml @@ -12,7 +12,7 @@ cartpole-appo: num_env_runners: 4 num_gpus: 0 observation_filter: MeanStdFilter - num_sgd_iter: 1 + num_epochs: 1 vf_loss_coeff: 0.01 vtrace: true model: diff --git a/rllib/tuned_examples/appo/cartpole_appo.py b/rllib/tuned_examples/appo/cartpole_appo.py index b84e7d2b6cf9..865c4ce85c31 100644 --- a/rllib/tuned_examples/appo/cartpole_appo.py +++ b/rllib/tuned_examples/appo/cartpole_appo.py @@ -6,7 +6,10 @@ ) from ray.rllib.utils.test_utils import add_rllib_example_script_args -parser = add_rllib_example_script_args() +parser = add_rllib_example_script_args( + default_reward=450.0, + default_timesteps=2000000, +) parser.set_defaults(enable_new_api_stack=True) # Use `parser` to add your own custom command line options to this script # and (if needed) use their values toset up `config` below. @@ -22,24 +25,24 @@ ) .environment("CartPole-v1") .training( + train_batch_size_per_learner=1000, vf_loss_coeff=0.05, - entropy_coeff=0.0, + entropy_coeff=0.01, + num_epochs=2, + lr=0.00075, + minibatch_size=250, ) .rl_module( model_config_dict={ - "vf_share_layers": True, + "fcnet_hiddens": [32], + #"vf_share_layers": True, "uses_new_env_runners": True, }, ) ) -stop = { - f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": 450.0, - NUM_ENV_STEPS_SAMPLED_LIFETIME: 2000000, -} - if __name__ == "__main__": from ray.rllib.utils.test_utils import run_rllib_example_script_experiment - run_rllib_example_script_experiment(config, args, stop=stop) + run_rllib_example_script_experiment(config, args) diff --git a/rllib/tuned_examples/appo/frozenlake-appo-vtrace.yaml b/rllib/tuned_examples/appo/frozenlake-appo-vtrace.yaml index c8f5d37cb971..5af435924178 100644 --- a/rllib/tuned_examples/appo/frozenlake-appo-vtrace.yaml +++ b/rllib/tuned_examples/appo/frozenlake-appo-vtrace.yaml @@ -29,5 +29,5 @@ frozenlake-appo-vtrace: num_envs_per_env_runner: 5 num_env_runners: 4 num_gpus: 0 - num_sgd_iter: 1 + num_epochs: 1 vf_loss_coeff: 0.01 diff --git a/rllib/tuned_examples/appo/halfcheetah-appo.yaml b/rllib/tuned_examples/appo/halfcheetah-appo.yaml index 0102b15d999b..169e4e82b184 100644 --- a/rllib/tuned_examples/appo/halfcheetah-appo.yaml +++ b/rllib/tuned_examples/appo/halfcheetah-appo.yaml @@ -21,7 +21,7 @@ halfcheetah-appo: num_multi_gpu_tower_stacks: 1 num_envs_per_env_runner: 32 minibatch_buffer_size: 16 - num_sgd_iter: 32 + num_epochs: 32 clip_param: 0.2 lr_schedule: [ [0, 0.0005], diff --git a/rllib/tuned_examples/appo/multi-agent-cartpole-w-100-policies-appo.py b/rllib/tuned_examples/appo/multi-agent-cartpole-w-100-policies-appo.py index 071cae713fc3..091be32489c3 100644 --- 
a/rllib/tuned_examples/appo/multi-agent-cartpole-w-100-policies-appo.py +++ b/rllib/tuned_examples/appo/multi-agent-cartpole-w-100-policies-appo.py @@ -35,7 +35,7 @@ "fcnet_activation": "linear", "vf_share_layers": True, }, - num_sgd_iter=1, + num_epochs=1, vf_loss_coeff=0.005, vtrace=True, ) diff --git a/rllib/tuned_examples/appo/multi_agent_cartpole_appo_old_api_stack.py b/rllib/tuned_examples/appo/multi_agent_cartpole_appo_old_api_stack.py index 95277a40920a..a1ed308c55a4 100644 --- a/rllib/tuned_examples/appo/multi_agent_cartpole_appo_old_api_stack.py +++ b/rllib/tuned_examples/appo/multi_agent_cartpole_appo_old_api_stack.py @@ -25,7 +25,7 @@ policy_mapping_fn=(lambda agent_id, episode, worker, **kwargs: f"p{agent_id}"), ) .training( - num_sgd_iter=1, + num_epochs=1, vf_loss_coeff=0.005, vtrace=True, model={ diff --git a/rllib/tuned_examples/appo/multi_agent_stateless_cartpole_appo.py b/rllib/tuned_examples/appo/multi_agent_stateless_cartpole_appo.py index a8713f4350d3..31ae4c95a90d 100644 --- a/rllib/tuned_examples/appo/multi_agent_stateless_cartpole_appo.py +++ b/rllib/tuned_examples/appo/multi_agent_stateless_cartpole_appo.py @@ -9,10 +9,7 @@ from ray.rllib.utils.test_utils import add_rllib_example_script_args from ray.tune.registry import register_env -parser = add_rllib_example_script_args( - default_timesteps=2000000, - default_reward=350.0, -) +parser = add_rllib_example_script_args(default_timesteps=2000000) parser.set_defaults( enable_new_api_stack=True, num_agents=2, @@ -39,7 +36,7 @@ .training( train_batch_size_per_learner=600, lr=0.0005 * ((args.num_gpus or 1) ** 0.5), - num_sgd_iter=6, + num_epochs=6, vf_loss_coeff=0.05, grad_clip=20.0, ) diff --git a/rllib/tuned_examples/appo/pendulum-appo.yaml b/rllib/tuned_examples/appo/pendulum-appo.yaml index dd274338e1f2..6e9f544af4d9 100644 --- a/rllib/tuned_examples/appo/pendulum-appo.yaml +++ b/rllib/tuned_examples/appo/pendulum-appo.yaml @@ -16,7 +16,7 @@ pendulum-appo-vtrace: lr: 0.0003 train_batch_size: 100 minibatch_buffer_size: 16 - num_sgd_iter: 10 + num_epochs: 10 model: fcnet_hiddens: [256, 256] batch_mode: truncate_episodes diff --git a/rllib/tuned_examples/appo/pong-appo-w-rl-modules-and-learner.yaml b/rllib/tuned_examples/appo/pong-appo-w-rl-modules-and-learner.yaml index e3a88164a579..94088ab67c29 100644 --- a/rllib/tuned_examples/appo/pong-appo-w-rl-modules-and-learner.yaml +++ b/rllib/tuned_examples/appo/pong-appo-w-rl-modules-and-learner.yaml @@ -26,7 +26,7 @@ appo-pongnoframeskip-v5: broadcast_interval: 1 max_sample_requests_in_flight_per_worker: 1 num_envs_per_env_runner: 8 - num_sgd_iter: 2 + num_epochs: 2 vf_loss_coeff: 1.0 clip_param: 0.3 diff --git a/rllib/tuned_examples/appo/pong-appo.yaml b/rllib/tuned_examples/appo/pong-appo.yaml index af2e2afe7248..837e0559a8f8 100644 --- a/rllib/tuned_examples/appo/pong-appo.yaml +++ b/rllib/tuned_examples/appo/pong-appo.yaml @@ -28,7 +28,7 @@ pong-appo: num_multi_gpu_tower_stacks: 1 num_envs_per_env_runner: 8 minibatch_buffer_size: 4 - num_sgd_iter: 2 + num_epochs: 2 vf_loss_coeff: 1.0 clip_param: 0.3 num_gpus: 1 diff --git a/rllib/tuned_examples/appo/stateless_cartpole_appo.py b/rllib/tuned_examples/appo/stateless_cartpole_appo.py index 774f3764b738..b6672758dab6 100644 --- a/rllib/tuned_examples/appo/stateless_cartpole_appo.py +++ b/rllib/tuned_examples/appo/stateless_cartpole_appo.py @@ -29,7 +29,7 @@ ) .training( lr=0.0005 * ((args.num_gpus or 1) ** 0.5), - num_sgd_iter=6, + num_epochs=6, vf_loss_coeff=0.05, grad_clip=20.0, ) diff --git 
a/rllib/tuned_examples/bc/cartpole_recording.py b/rllib/tuned_examples/bc/cartpole_recording.py index 673a48c75900..f05cdcf8c6dc 100644 --- a/rllib/tuned_examples/bc/cartpole_recording.py +++ b/rllib/tuned_examples/bc/cartpole_recording.py @@ -33,7 +33,7 @@ .training( gamma=0.99, lr=0.0003, - num_sgd_iter=6, + num_epochs=6, vf_loss_coeff=0.01, use_kl_loss=True, ) diff --git a/rllib/tuned_examples/compact-regression-test.yaml b/rllib/tuned_examples/compact-regression-test.yaml index 74a89ed0a650..21dbdb6d1be4 100644 --- a/rllib/tuned_examples/compact-regression-test.yaml +++ b/rllib/tuned_examples/compact-regression-test.yaml @@ -41,8 +41,8 @@ atari-ppo-tf: entropy_coeff: 0.01 train_batch_size: 5000 rollout_fragment_length: 100 - sgd_minibatch_size: 500 - num_sgd_iter: 10 + minibatch_size: 500 + num_epochs: 10 num_env_runners: 10 num_envs_per_env_runner: 5 batch_mode: truncate_episodes @@ -68,8 +68,8 @@ atari-ppo-torch: entropy_coeff: 0.01 train_batch_size: 5000 rollout_fragment_length: 100 - sgd_minibatch_size: 500 - num_sgd_iter: 10 + minibatch_size: 500 + num_epochs: 10 num_env_runners: 10 num_envs_per_env_runner: 5 batch_mode: truncate_episodes diff --git a/rllib/tuned_examples/impala/cartpole-impala-separate-losses.py b/rllib/tuned_examples/impala/cartpole-impala-separate-losses.py index 2f890e68308f..95b6feb478d5 100644 --- a/rllib/tuned_examples/impala/cartpole-impala-separate-losses.py +++ b/rllib/tuned_examples/impala/cartpole-impala-separate-losses.py @@ -23,7 +23,7 @@ _separate_vf_optimizer=True, # Separate learning rate for the value function branch. _lr_vf=0.00075, - num_sgd_iter=6, + num_epochs=6, # `vf_loss_coeff` will be ignored anyways as we use separate loss terms. vf_loss_coeff=0.01, vtrace=True, diff --git a/rllib/tuned_examples/impala/multi_agent_cartpole_impala_old_api_stack.py b/rllib/tuned_examples/impala/multi_agent_cartpole_impala_old_api_stack.py index d1748fef0911..0fdf075802d6 100644 --- a/rllib/tuned_examples/impala/multi_agent_cartpole_impala_old_api_stack.py +++ b/rllib/tuned_examples/impala/multi_agent_cartpole_impala_old_api_stack.py @@ -25,7 +25,7 @@ policy_mapping_fn=(lambda agent_id, episode, worker, **kwargs: f"p{agent_id}"), ) .training( - num_sgd_iter=1, + num_epochs=1, vf_loss_coeff=0.005, vtrace=True, model={ diff --git a/rllib/tuned_examples/ppo/atari_ppo.py b/rllib/tuned_examples/ppo/atari_ppo.py index 5f06866894a6..8c6629d7ee05 100644 --- a/rllib/tuned_examples/ppo/atari_ppo.py +++ b/rllib/tuned_examples/ppo/atari_ppo.py @@ -59,13 +59,13 @@ def _env_creator(cfg): .training( learner_connector=_make_learner_connector, train_batch_size_per_learner=4000, # 5000 on old yaml example - mini_batch_size_per_learner=128, # 500 on old yaml example + minibatch_size=128, # 500 on old yaml example lambda_=0.95, kl_coeff=0.5, clip_param=0.1, vf_clip_param=10.0, entropy_coeff=0.01, - num_sgd_iter=10, + num_epochs=10, lr=0.00015 * args.num_gpus, grad_clip=100.0, grad_clip_by="global_norm", diff --git a/rllib/tuned_examples/ppo/benchmark_ppo_mujoco.py b/rllib/tuned_examples/ppo/benchmark_ppo_mujoco.py index e266f1b64902..18de125d7f06 100644 --- a/rllib/tuned_examples/ppo/benchmark_ppo_mujoco.py +++ b/rllib/tuned_examples/ppo/benchmark_ppo_mujoco.py @@ -107,9 +107,9 @@ def stop_all(self): gamma=0.99, lambda_=0.95, lr=0.0003, - num_sgd_iter=15, + num_epochs=15, train_batch_size=32 * 512, - sgd_minibatch_size=4096, + minibatch_size=4096, vf_loss_coeff=0.01, model={ "fcnet_hiddens": [64, 64], diff --git a/rllib/tuned_examples/ppo/benchmark_ppo_mujoco_pb2.py 
b/rllib/tuned_examples/ppo/benchmark_ppo_mujoco_pb2.py index 841ee40a52e1..8116a2431cd5 100644 --- a/rllib/tuned_examples/ppo/benchmark_ppo_mujoco_pb2.py +++ b/rllib/tuned_examples/ppo/benchmark_ppo_mujoco_pb2.py @@ -51,8 +51,8 @@ "vf_loss_coeff": [0.01, 1.0], "clip_param": [0.1, 0.3], "kl_target": [0.01, 0.03], - "sgd_minibatch_size": [512, 4096], - "num_sgd_iter": [6, 32], + "minibatch_size": [512, 4096], + "num_epochs": [6, 32], "vf_share_layers": [False, True], "use_kl_loss": [False, True], "kl_coeff": [0.1, 0.4], @@ -96,15 +96,15 @@ vf_loss_coeff=tune.uniform(0.01, 1.0), clip_param=tune.uniform(0.1, 0.3), kl_target=tune.uniform(0.01, 0.03), - sgd_minibatch_size=tune.choice([512, 1024, 2048, 4096]), - num_sgd_iter=tune.randint(6, 32), + minibatch_size=tune.choice([512, 1024, 2048, 4096]), + num_epochs=tune.randint(6, 32), vf_share_layers=tune.choice([True, False]), use_kl_loss=tune.choice([True, False]), kl_coeff=tune.uniform(0.1, 0.4), vf_clip_param=tune.choice([10.0, 40.0, float("inf")]), grad_clip=tune.choice([None, 40, 100, 200]), train_batch_size=tune.sample_from( - lambda spec: spec.config["sgd_minibatch_size"] * num_rollout_workers + lambda spec: spec.config["minibatch_size"] * num_rollout_workers ), model={ "fcnet_hiddens": [64, 64], diff --git a/rllib/tuned_examples/ppo/cartpole-ppo.yaml b/rllib/tuned_examples/ppo/cartpole-ppo.yaml index 2042d496b464..94a093eec3b3 100644 --- a/rllib/tuned_examples/ppo/cartpole-ppo.yaml +++ b/rllib/tuned_examples/ppo/cartpole-ppo.yaml @@ -11,7 +11,7 @@ cartpole-ppo: gamma: 0.99 lr: 0.0003 num_env_runners: 1 - num_sgd_iter: 6 + num_epochs: 6 vf_loss_coeff: 0.01 model: fcnet_hiddens: [32] diff --git a/rllib/tuned_examples/ppo/cartpole_ppo.py b/rllib/tuned_examples/ppo/cartpole_ppo.py index cc9171ee5fc7..612f267f188a 100644 --- a/rllib/tuned_examples/ppo/cartpole_ppo.py +++ b/rllib/tuned_examples/ppo/cartpole_ppo.py @@ -31,7 +31,7 @@ .training( gamma=0.99, lr=0.0003, - num_sgd_iter=6, + num_epochs=6, vf_loss_coeff=0.01, use_kl_loss=True, ) diff --git a/rllib/tuned_examples/ppo/cartpole_truncated_ppo.py b/rllib/tuned_examples/ppo/cartpole_truncated_ppo.py index a51e21a48b5f..57f1ecffda4a 100644 --- a/rllib/tuned_examples/ppo/cartpole_truncated_ppo.py +++ b/rllib/tuned_examples/ppo/cartpole_truncated_ppo.py @@ -35,7 +35,7 @@ .training( gamma=0.99, lr=0.0003, - num_sgd_iter=6, + num_epochs=6, vf_loss_coeff=0.01, use_kl_loss=True, ) diff --git a/rllib/tuned_examples/ppo/halfcheetah-ppo.yaml b/rllib/tuned_examples/ppo/halfcheetah-ppo.yaml index 0e050266a96d..96fded2c6a1c 100644 --- a/rllib/tuned_examples/ppo/halfcheetah-ppo.yaml +++ b/rllib/tuned_examples/ppo/halfcheetah-ppo.yaml @@ -11,11 +11,11 @@ halfcheetah-ppo: gamma: 0.99 lambda: 0.95 kl_coeff: 1.0 - num_sgd_iter: 32 + num_epochs: 32 lr: .0003 vf_loss_coeff: 0.5 clip_param: 0.2 - sgd_minibatch_size: 4096 + minibatch_size: 4096 train_batch_size: 65536 num_env_runners: 16 num_gpus: 1 diff --git a/rllib/tuned_examples/ppo/hopper-ppo.yaml b/rllib/tuned_examples/ppo/hopper-ppo.yaml index a12df2073ee5..3ad4890618f5 100644 --- a/rllib/tuned_examples/ppo/hopper-ppo.yaml +++ b/rllib/tuned_examples/ppo/hopper-ppo.yaml @@ -7,9 +7,9 @@ hopper-ppo: framework: torch gamma: 0.995 kl_coeff: 1.0 - num_sgd_iter: 20 + num_epochs: 20 lr: .0001 - sgd_minibatch_size: 32768 + minibatch_size: 32768 train_batch_size: 160000 num_env_runners: 64 num_gpus: 4 diff --git a/rllib/tuned_examples/ppo/humanoid-ppo-gae.yaml b/rllib/tuned_examples/ppo/humanoid-ppo-gae.yaml index ace034852908..779e42f50626 100644 --- 
a/rllib/tuned_examples/ppo/humanoid-ppo-gae.yaml +++ b/rllib/tuned_examples/ppo/humanoid-ppo-gae.yaml @@ -11,9 +11,9 @@ humanoid-ppo-gae: lambda: 0.95 clip_param: 0.2 kl_coeff: 1.0 - num_sgd_iter: 20 + num_epochs: 20 lr: .0001 - sgd_minibatch_size: 32768 + minibatch_size: 32768 horizon: 5000 train_batch_size: 320000 model: diff --git a/rllib/tuned_examples/ppo/humanoid-ppo.yaml b/rllib/tuned_examples/ppo/humanoid-ppo.yaml index 5a26d07172eb..8a22c2e9607c 100644 --- a/rllib/tuned_examples/ppo/humanoid-ppo.yaml +++ b/rllib/tuned_examples/ppo/humanoid-ppo.yaml @@ -9,9 +9,9 @@ humanoid-ppo: framework: torch gamma: 0.995 kl_coeff: 1.0 - num_sgd_iter: 20 + num_epochs: 20 lr: .0001 - sgd_minibatch_size: 32768 + minibatch_size: 32768 train_batch_size: 320000 model: free_log_std: true diff --git a/rllib/tuned_examples/ppo/memory-leak-test-ppo.yaml b/rllib/tuned_examples/ppo/memory-leak-test-ppo.yaml index 5eafdd533401..631e65216953 100644 --- a/rllib/tuned_examples/ppo/memory-leak-test-ppo.yaml +++ b/rllib/tuned_examples/ppo/memory-leak-test-ppo.yaml @@ -13,5 +13,5 @@ memory-leak-test-ppo: num_env_runners: 4 num_envs_per_env_runner: 5 train_batch_size: 500 - sgd_minibatch_size: 256 - num_sgd_iter: 5 + minibatch_size: 256 + num_epochs: 5 diff --git a/rllib/tuned_examples/ppo/memory_leak_test_ppo_new_stack.py b/rllib/tuned_examples/ppo/memory_leak_test_ppo_new_stack.py index 65c6d3dc4261..bd3794daf41d 100644 --- a/rllib/tuned_examples/ppo/memory_leak_test_ppo_new_stack.py +++ b/rllib/tuned_examples/ppo/memory_leak_test_ppo_new_stack.py @@ -14,5 +14,5 @@ num_env_runners=4, num_envs_per_env_runner=5, ) - .training(train_batch_size=500, sgd_minibatch_size=256, num_sgd_iter=5) + .training(train_batch_size=500, minibatch_size=256, num_epochs=5) ) diff --git a/rllib/tuned_examples/ppo/multi_agent_cartpole_ppo.py b/rllib/tuned_examples/ppo/multi_agent_cartpole_ppo.py index 054cfc056831..0dd22ed050a1 100644 --- a/rllib/tuned_examples/ppo/multi_agent_cartpole_ppo.py +++ b/rllib/tuned_examples/ppo/multi_agent_cartpole_ppo.py @@ -36,7 +36,7 @@ .training( gamma=0.99, lr=0.0003, - num_sgd_iter=6, + num_epochs=6, vf_loss_coeff=0.01, use_kl_loss=True, ) diff --git a/rllib/tuned_examples/ppo/multi_agent_pendulum_ppo.py b/rllib/tuned_examples/ppo/multi_agent_pendulum_ppo.py index 082d505efcce..42f0398a97bd 100644 --- a/rllib/tuned_examples/ppo/multi_agent_pendulum_ppo.py +++ b/rllib/tuned_examples/ppo/multi_agent_pendulum_ppo.py @@ -31,7 +31,7 @@ lr=0.0003, lambda_=0.1, vf_clip_param=10.0, - num_sgd_iter=6, + num_epochs=6, ) .rl_module( model_config_dict={ diff --git a/rllib/tuned_examples/ppo/multi_agent_stateless_cartpole_ppo.py b/rllib/tuned_examples/ppo/multi_agent_stateless_cartpole_ppo.py index a0b515c7f103..f307dd726fd6 100644 --- a/rllib/tuned_examples/ppo/multi_agent_stateless_cartpole_ppo.py +++ b/rllib/tuned_examples/ppo/multi_agent_stateless_cartpole_ppo.py @@ -38,7 +38,7 @@ .training( lr=0.0003 * ((args.num_gpus or 1) ** 0.5), gamma=0.99, - num_sgd_iter=6, + num_epochs=6, vf_loss_coeff=0.05, use_kl_loss=True, ) diff --git a/rllib/tuned_examples/ppo/pendulum-ppo.yaml b/rllib/tuned_examples/ppo/pendulum-ppo.yaml index ae60cfd07ec4..7ab57c621a97 100644 --- a/rllib/tuned_examples/ppo/pendulum-ppo.yaml +++ b/rllib/tuned_examples/ppo/pendulum-ppo.yaml @@ -16,7 +16,7 @@ pendulum-ppo: lambda: 0.1 gamma: 0.95 lr: 0.0003 - sgd_minibatch_size: 64 + minibatch_size: 64 observation_filter: MeanStdFilter model: fcnet_activation: relu diff --git a/rllib/tuned_examples/ppo/pendulum-transformed-actions-ppo.yaml 
b/rllib/tuned_examples/ppo/pendulum-transformed-actions-ppo.yaml index e573eabbfe72..04a12eb3c46d 100644 --- a/rllib/tuned_examples/ppo/pendulum-transformed-actions-ppo.yaml +++ b/rllib/tuned_examples/ppo/pendulum-transformed-actions-ppo.yaml @@ -24,8 +24,8 @@ pendulum-ppo: gamma: 0.95 lr: 0.0003 train_batch_size: 512 - sgd_minibatch_size: 64 - num_sgd_iter: 6 + minibatch_size: 64 + num_epochs: 6 observation_filter: MeanStdFilter model: fcnet_activation: relu diff --git a/rllib/tuned_examples/ppo/pendulum_ppo.py b/rllib/tuned_examples/ppo/pendulum_ppo.py index 84c0ddd74f90..9ffd945e0979 100644 --- a/rllib/tuned_examples/ppo/pendulum_ppo.py +++ b/rllib/tuned_examples/ppo/pendulum_ppo.py @@ -23,7 +23,7 @@ lr=0.0003, lambda_=0.1, vf_clip_param=10.0, - num_sgd_iter=6, + num_epochs=6, ) .rl_module( model_config_dict={ diff --git a/rllib/tuned_examples/ppo/repeatafterme-ppo-lstm.yaml b/rllib/tuned_examples/ppo/repeatafterme-ppo-lstm.yaml index d59329616193..490b63245f15 100644 --- a/rllib/tuned_examples/ppo/repeatafterme-ppo-lstm.yaml +++ b/rllib/tuned_examples/ppo/repeatafterme-ppo-lstm.yaml @@ -17,7 +17,7 @@ repeat-after-me-ppo-w-lstm: lr: 0.0003 num_env_runners: 0 num_envs_per_env_runner: 20 - num_sgd_iter: 5 + num_epochs: 5 entropy_coeff: 0.00001 model: use_lstm: true diff --git a/rllib/tuned_examples/ppo/stateless_cartpole_ppo.py b/rllib/tuned_examples/ppo/stateless_cartpole_ppo.py index 0df7a29abbdf..9e188d3982f4 100644 --- a/rllib/tuned_examples/ppo/stateless_cartpole_ppo.py +++ b/rllib/tuned_examples/ppo/stateless_cartpole_ppo.py @@ -30,7 +30,7 @@ .training( lr=0.0003 * ((args.num_gpus or 1) ** 0.5), gamma=0.99, - num_sgd_iter=6, + num_epochs=6, vf_loss_coeff=0.05, use_kl_loss=True, ) diff --git a/rllib/tuned_examples/ppo/unity3d-soccer-strikers-vs-goalie-ppo.yaml b/rllib/tuned_examples/ppo/unity3d-soccer-strikers-vs-goalie-ppo.yaml index cdcfee928c0f..c6ceb1461149 100644 --- a/rllib/tuned_examples/ppo/unity3d-soccer-strikers-vs-goalie-ppo.yaml +++ b/rllib/tuned_examples/ppo/unity3d-soccer-strikers-vs-goalie-ppo.yaml @@ -30,13 +30,13 @@ unity3d-soccer-strikers-vs-goalie-ppo: lr: 0.0003 lambda: 0.95 gamma: 0.99 - sgd_minibatch_size: 256 + minibatch_size: 256 train_batch_size: 4000 clip_param: 0.2 # For running in editor, just use one Worker (we only have # one Unity running)! num_env_runners: 10 - num_sgd_iter: 20 + num_epochs: 20 rollout_fragment_length: 200 model: fcnet_hiddens: [512, 512] diff --git a/rllib/tuned_examples/ppo/walker2d-ppo.yaml b/rllib/tuned_examples/ppo/walker2d-ppo.yaml index 13305acc0c9a..9429f0d4161d 100644 --- a/rllib/tuned_examples/ppo/walker2d-ppo.yaml +++ b/rllib/tuned_examples/ppo/walker2d-ppo.yaml @@ -6,9 +6,9 @@ walker2d-v1-ppo: # Works for both torch and tf. 
     framework: torch
     kl_coeff: 1.0
-    num_sgd_iter: 20
+    num_epochs: 20
     lr: .0001
-    sgd_minibatch_size: 32768
+    minibatch_size: 32768
     train_batch_size: 320000
     num_env_runners: 64
     num_gpus: 4
diff --git a/rllib/utils/exploration/tests/test_curiosity.py b/rllib/utils/exploration/tests/test_curiosity.py
index ddc5939c5df5..4531154371f0 100644
--- a/rllib/utils/exploration/tests/test_curiosity.py
+++ b/rllib/utils/exploration/tests/test_curiosity.py
@@ -263,7 +263,7 @@ def test_curiosity_on_partially_observable_domain(self):
                     "fcnet_hiddens": [256, 256],
                     "fcnet_activation": "relu",
                 },
-                num_sgd_iter=8,
+                num_epochs=8,
             )
         )
 
diff --git a/rllib/utils/minibatch_utils.py b/rllib/utils/minibatch_utils.py
index 883d08d84ade..fdcab82146aa 100644
--- a/rllib/utils/minibatch_utils.py
+++ b/rllib/utils/minibatch_utils.py
@@ -9,19 +9,36 @@
 
 @DeveloperAPI
 class MiniBatchIteratorBase:
-    """The base class for all minibatch iterators.
-
-    Args:
-        batch: The input multi-agent batch.
-        minibatch_size: The size of the minibatch for each module_id.
-        num_iters: The number of epochs to cover. If the input batch is smaller than
-            minibatch_size, then the iterator will cycle through the batch until it
-            has covered num_iters epochs.
-    """
+    """The base class for all minibatch iterators."""
 
     def __init__(
-        self, batch: MultiAgentBatch, minibatch_size: int, num_iters: int = 1
+        self,
+        batch: MultiAgentBatch,
+        *,
+        num_epochs: int = 1,
+        shuffle_batch_per_epoch: bool = True,
+        minibatch_size: int,
+        num_total_minibatches: int = 0,
     ) -> None:
+        """Initializes a MiniBatchIteratorBase instance.
+
+        Args:
+            batch: The input multi-agent batch.
+            num_epochs: The number of complete passes over the entire train batch. Each
+                pass might be further split into n minibatches (if `minibatch_size` is
+                provided). The train batch is typically generated from episodes through
+                the Learner connector pipeline.
+            shuffle_batch_per_epoch: Whether to shuffle the train batch once per epoch.
+            minibatch_size: The size of the minibatches into which the train batch is
+                further split per epoch.
+            num_total_minibatches: The total number of minibatches to loop through
+                (over all `num_epochs` epochs). It's only required to set this to != 0
+                in multi-agent + multi-GPU situations, in which the MultiAgentEpisodes
+                themselves are roughly sharded equally; however, they might contain
+                SingleAgentEpisodes with very lopsided length distributions. Thus,
+                without this fixed, pre-computed value, one Learner might go through a
+                different number of minibatch passes than others, causing a deadlock.
+        """
         pass
 
 
@@ -29,58 +46,56 @@ def __init__(
 class MiniBatchCyclicIterator(MiniBatchIteratorBase):
     """This implements a simple multi-agent minibatch iterator.
 
-
     This iterator will split the input multi-agent batch into minibatches where
     the size of batch for each module_id (aka policy_id) is equal to minibatch_size.
     If the input batch is smaller than minibatch_size, then the iterator will cycle
-    through the batch until it has covered num_iters epochs.
-
-    Args:
-        batch: The input multi-agent batch.
-        minibatch_size: The size of the minibatch for each module_id.
-        num_iters: The minimum number of epochs to cover. If the input batch is smaller
-            than minibatch_size, then the iterator will cycle through the batch until
-            it has covered at least num_iters epochs.
+    through the batch until it has covered `num_epochs` epochs.
""" - def __init__( self, batch: MultiAgentBatch, + *, + num_epochs: int = 1, + shuffle_batch_per_epoch: bool = True, minibatch_size: int, - num_iters: int = 1, - uses_new_env_runners: bool = False, - num_total_mini_batches: int = 0, - shuffle: bool = False, + num_total_minibatches: int = 0, + _uses_new_env_runners: bool = False, ) -> None: - super().__init__(batch, minibatch_size, num_iters) + """Initializes a MiniBatchCyclicIterator instance.""" + super().__init__( + batch, + num_epochs=num_epochs, + minibatch_size=minibatch_size, + shuffle_batch_per_epoch=shuffle_batch_per_epoch, + ) + self._batch = batch self._minibatch_size = minibatch_size - self._num_iters = num_iters + self._num_epochs = num_epochs + self._shuffle_batch_per_epoch = shuffle_batch_per_epoch # mapping from module_id to the start index of the batch self._start = {mid: 0 for mid in batch.policy_batches.keys()} # mapping from module_id to the number of epochs covered for each module_id self._num_covered_epochs = {mid: 0 for mid in batch.policy_batches.keys()} - self._uses_new_env_runners = uses_new_env_runners - - self._mini_batch_count = 0 - self._num_total_mini_batches = num_total_mini_batches + self._uses_new_env_runners = _uses_new_env_runners - self._shuffle = shuffle + self._minibatch_count = 0 + self._num_total_minibatches = num_total_minibatches def __iter__(self): while ( # Make sure each item in the total batch gets at least iterated over - # `self._num_iters` times. + # `self._num_epochs` times. ( - self._num_total_mini_batches == 0 - and min(self._num_covered_epochs.values()) < self._num_iters + self._num_total_minibatches == 0 + and min(self._num_covered_epochs.values()) < self._num_epochs ) # Make sure we reach at least the given minimum number of mini-batches. or ( - self._num_total_mini_batches > 0 - and self._mini_batch_count < self._num_total_mini_batches + self._num_total_minibatches > 0 + and self._minibatch_count < self._num_total_minibatches ) ): minibatch = {} @@ -89,7 +104,7 @@ def __iter__(self): # Shuffle the individual single-agent batch, if required. # This should happen once per minibatch iteration in order to make # each iteration go through a different set of minibatches. 
- if self._shuffle: + if self._shuffle_batch_per_epoch: module_batch.shuffle() if len(module_batch) == 0: @@ -166,12 +181,12 @@ def get_len(b): minibatch = MultiAgentBatch(minibatch, len(self._batch)) yield minibatch - self._mini_batch_count += 1 + self._minibatch_count += 1 class MiniBatchDummyIterator(MiniBatchIteratorBase): - def __init__(self, batch: MultiAgentBatch, minibatch_size: int, num_iters: int = 1): - super().__init__(batch, minibatch_size, num_iters) + def __init__(self, batch: MultiAgentBatch, **kwargs): + super().__init__(batch, **kwargs) self._batch = batch def __iter__(self): diff --git a/rllib/utils/tests/test_minibatch_utils.py b/rllib/utils/tests/test_minibatch_utils.py index cd5f3bbddf4d..879e8b522a1b 100644 --- a/rllib/utils/tests/test_minibatch_utils.py +++ b/rllib/utils/tests/test_minibatch_utils.py @@ -14,20 +14,20 @@ tf1.enable_eager_execution() CONFIGS = [ - {"mini_batch_size": 256, "num_sgd_iter": 30, "agent_steps": (1652, 1463)}, - {"mini_batch_size": 128, "num_sgd_iter": 10, "agent_steps": (1000, 2)}, - {"mini_batch_size": 128, "num_sgd_iter": 3, "agent_steps": (56, 56)}, - {"mini_batch_size": 128, "num_sgd_iter": 7, "agent_steps": (56, 56)}, - {"mini_batch_size": 128, "num_sgd_iter": 10, "agent_steps": (56, 56)}, - {"mini_batch_size": 128, "num_sgd_iter": 10, "agent_steps": (56, 3)}, - {"mini_batch_size": 128, "num_sgd_iter": 10, "agent_steps": (56, 4)}, - {"mini_batch_size": 128, "num_sgd_iter": 10, "agent_steps": (56, 55)}, - {"mini_batch_size": 128, "num_sgd_iter": 10, "agent_steps": (400, 400)}, - {"mini_batch_size": 128, "num_sgd_iter": 10, "agent_steps": (64, 64)}, + {"minibatch_size": 256, "num_epochs": 30, "agent_steps": (1652, 1463)}, + {"minibatch_size": 128, "num_epochs": 10, "agent_steps": (1000, 2)}, + {"minibatch_size": 128, "num_epochs": 3, "agent_steps": (56, 56)}, + {"minibatch_size": 128, "num_epochs": 7, "agent_steps": (56, 56)}, + {"minibatch_size": 128, "num_epochs": 10, "agent_steps": (56, 56)}, + {"minibatch_size": 128, "num_epochs": 10, "agent_steps": (56, 3)}, + {"minibatch_size": 128, "num_epochs": 10, "agent_steps": (56, 4)}, + {"minibatch_size": 128, "num_epochs": 10, "agent_steps": (56, 55)}, + {"minibatch_size": 128, "num_epochs": 10, "agent_steps": (400, 400)}, + {"minibatch_size": 128, "num_epochs": 10, "agent_steps": (64, 64)}, # W/ SEQ_LENS. { - "mini_batch_size": 64, - "num_sgd_iter": 1, + "minibatch_size": 64, + "num_epochs": 1, "agent_steps": (128,), "seq_lens": [16, 16, 16, 16, 16, 16, 2, 2, 14, 14], "padding": True, @@ -39,8 +39,8 @@ class TestMinibatchUtils(unittest.TestCase): def test_minibatch_cyclic_iterator(self): for config in CONFIGS: - mini_batch_size = config["mini_batch_size"] - num_sgd_iter = config["num_sgd_iter"] + minibatch_size = config["minibatch_size"] + num_epochs = config["num_epochs"] agent_steps = config["agent_steps"] seq_lens = config.get("seq_lens") max_seq_len = None @@ -85,7 +85,9 @@ def test_minibatch_cyclic_iterator(self): ) mb = MultiAgentBatch(sample_batches, num_env_steps) - batch_iter = MiniBatchCyclicIterator(mb, mini_batch_size, num_sgd_iter) + batch_iter = MiniBatchCyclicIterator( + mb, minibatch_size=minibatch_size, num_epochs=num_epochs + ) print(config) iteration_counter = 0 for batch in batch_iter: @@ -94,14 +96,14 @@ def test_minibatch_cyclic_iterator(self): print(batch["pol0"]["obs"]) print("*" * 80) # Check that for each policy the batch size is equal to the - # mini_batch_size. + # minibatch_size. 
for policy_batch in batch.policy_batches.values(): - check(policy_batch.count, mini_batch_size) + check(policy_batch.count, minibatch_size) iteration_counter += 1 # For each policy check that the last item in batch matches the expected - # values, i.e. iteration_counter * mini_batch_size % agent_steps - 1. - total_steps = iteration_counter * mini_batch_size + # values, i.e. iteration_counter * minibatch_size % agent_steps - 1. + total_steps = iteration_counter * minibatch_size for policy_idx, policy_batch in enumerate( batch.policy_batches.values() ): @@ -111,9 +113,9 @@ def test_minibatch_cyclic_iterator(self): check(policy_batch["obs"][-1], expected_last_item) # Check iteration counter (should be - # ceil(num_gsd_iter * max(agent_steps) / mini_batch_size)). + # ceil(num_gsd_iter * max(agent_steps) / minibatch_size)). expected_iteration_counter = np.ceil( - num_sgd_iter * max(agent_steps) / mini_batch_size + num_epochs * max(agent_steps) / minibatch_size ) if not seq_lens: check(iteration_counter, expected_iteration_counter) From 38f0d99476c07dcb242405aef61d722574f46801 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Mon, 2 Sep 2024 16:36:06 +0200 Subject: [PATCH 03/20] wip Signed-off-by: sven1977 --- rllib/algorithms/appo/appo.py | 1 + rllib/algorithms/impala/impala.py | 1 + rllib/algorithms/impala/impala_learner.py | 26 +++++++++++++++++++++++ rllib/policy/sample_batch.py | 3 +++ 4 files changed, 31 insertions(+) diff --git a/rllib/algorithms/appo/appo.py b/rllib/algorithms/appo/appo.py index 73ceef6f3264..d2db78febbce 100644 --- a/rllib/algorithms/appo/appo.py +++ b/rllib/algorithms/appo/appo.py @@ -98,6 +98,7 @@ def __init__(self, algo_class=None): self.use_kl_loss = False self.kl_coeff = 1.0 self.kl_target = 0.01 + #self.shuffle_batch_per_epoch = True # Override some of IMPALAConfig's default values with APPO-specific values. self.num_env_runners = 2 diff --git a/rllib/algorithms/impala/impala.py b/rllib/algorithms/impala/impala.py index 9ad590f72f34..ea9da381b828 100644 --- a/rllib/algorithms/impala/impala.py +++ b/rllib/algorithms/impala/impala.py @@ -167,6 +167,7 @@ def __init__(self, algo_class=None): self._lr_vf = 0.0005 # @OldAPIstack # Override some of AlgorithmConfig's default values with IMPALA-specific values. + self.num_learners = 1 self.rollout_fragment_length = 50 self.train_batch_size = 500 # @OldAPIstack self.train_batch_size_per_learner = 500 diff --git a/rllib/algorithms/impala/impala_learner.py b/rllib/algorithms/impala/impala_learner.py index f6f6df0cdb1e..e0962226db48 100644 --- a/rllib/algorithms/impala/impala_learner.py +++ b/rllib/algorithms/impala/impala_learner.py @@ -11,6 +11,8 @@ from ray.rllib.algorithms.impala.impala import LEARNER_RESULTS_CURR_ENTROPY_COEFF_KEY from ray.rllib.core.columns import Columns from ray.rllib.core.learner.learner import Learner +from ray.rllib.connectors.common import AddStatesFromEpisodesToBatch +from ray.rllib.connectors.connector_v2 import ConnectorV2 from ray.rllib.connectors.learner import AddOneTsToEpisodesAndTruncate from ray.rllib.policy.sample_batch import MultiAgentBatch, SampleBatch from ray.rllib.utils.annotations import ( @@ -65,6 +67,10 @@ def build(self) -> None: # slots to mask out). if self.config.add_default_connectors_to_learner_pipeline: self._learner_connector.prepend(AddOneTsToEpisodesAndTruncate()) + self._learner_connector.insert_after( + AddStatesFromEpisodesToBatch, + AddVTraceSeqLensNoRNN, + ) # Create and start the GPU-loader thread. 
It picks up train-ready batches from # the "GPU-loader queue" and loads them to the GPU, then places the GPU batches @@ -287,3 +293,23 @@ def step(self): self._out_queue.put(copy.deepcopy(results)) self.metrics.log_value(QUEUE_SIZE_RESULTS_QUEUE, self._out_queue.qsize()) + + +class AddVTraceSeqLensNoRNN(ConnectorV2): + def __init__( + self, + input_observation_space=None, + input_action_space=None, + *, + rollout_fragment_length: int, + **kwargs, + ): + super().__init__(input_observation_space, input_action_space, **kwargs) + self._rollout_fragment_length = rollout_fragment_length + + @override(ConnectorV2) + def __call__(self, *, rl_module, batch, episodes): + if Columns.SEQ_LENS not in batch: + pass + TODO # Continue implementing here + return batch diff --git a/rllib/policy/sample_batch.py b/rllib/policy/sample_batch.py index 098ddc2218ad..33a0b5eea25b 100644 --- a/rllib/policy/sample_batch.py +++ b/rllib/policy/sample_batch.py @@ -483,7 +483,10 @@ def shuffle(self) -> "SampleBatch": permutation = np.random.permutation(len(self[SampleBatch.SEQ_LENS])) self_as_dict = dict(self) + infos = self_as_dict.pop(Columns.INFOS, None) shuffled = tree.map_structure(lambda v: v[permutation], self_as_dict) + if infos is not None: + self_as_dict[Columns.INFOS] = [infos[i] for i in permutation] self.update(shuffled) From ea8075f793ad49d2afc6af869ea5e6abc8d5f7f2 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Mon, 2 Sep 2024 19:47:14 +0200 Subject: [PATCH 04/20] wip Signed-off-by: sven1977 --- rllib/tuned_examples/ppo/cartpole_ppo.py | 23 ++++------------------- rllib/utils/minibatch_utils.py | 11 +++++------ 2 files changed, 9 insertions(+), 25 deletions(-) diff --git a/rllib/tuned_examples/ppo/cartpole_ppo.py b/rllib/tuned_examples/ppo/cartpole_ppo.py index 612f267f188a..9d8f09d43e06 100644 --- a/rllib/tuned_examples/ppo/cartpole_ppo.py +++ b/rllib/tuned_examples/ppo/cartpole_ppo.py @@ -1,13 +1,9 @@ from ray.rllib.algorithms.ppo import PPOConfig -from ray.rllib.utils.metrics import ( - ENV_RUNNER_RESULTS, - EPISODE_RETURN_MEAN, - EVALUATION_RESULTS, - NUM_ENV_STEPS_SAMPLED_LIFETIME, -) from ray.rllib.utils.test_utils import add_rllib_example_script_args -parser = add_rllib_example_script_args() +parser = add_rllib_example_script_args( + default_reward=450.0, default_timesteps=300000 +) parser.set_defaults(enable_new_api_stack=True) # Use `parser` to add your own custom command line options to this script # and (if needed) use their values toset up `config` below. @@ -35,21 +31,10 @@ vf_loss_coeff=0.01, use_kl_loss=True, ) - .evaluation( - evaluation_num_env_runners=1, - evaluation_interval=1, - evaluation_parallel_to_training=True, - evaluation_config=PPOConfig.overrides(exploration=False), - ) ) -stop = { - f"{NUM_ENV_STEPS_SAMPLED_LIFETIME}": 200000, - f"{EVALUATION_RESULTS}/{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": 350.0, -} - if __name__ == "__main__": from ray.rllib.utils.test_utils import run_rllib_example_script_experiment - run_rllib_example_script_experiment(config, args, stop=stop) + run_rllib_example_script_experiment(config, args) diff --git a/rllib/utils/minibatch_utils.py b/rllib/utils/minibatch_utils.py index fdcab82146aa..cad11ddac9aa 100644 --- a/rllib/utils/minibatch_utils.py +++ b/rllib/utils/minibatch_utils.py @@ -101,12 +101,6 @@ def __iter__(self): minibatch = {} for module_id, module_batch in self._batch.policy_batches.items(): - # Shuffle the individual single-agent batch, if required. 
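Conceptually, the shuffle touched above permutes rows with one shared permutation, re-orders the per-timestep `infos` list element by element (a Python list cannot be fancy-indexed by a NumPy array), and, for time-ranked data, permutes only along the batch axis so each sequence stays intact. A NumPy-only sketch of that idea, with invented shapes and values:

import numpy as np

# Toy flat batch: 5 timesteps, with per-timestep infos kept as a plain list.
obs = np.arange(10).reshape(5, 2)
rewards = np.array([0.1, 0.2, 0.3, 0.4, 0.5])
infos = [{"t": t} for t in range(5)]

# One permutation, applied to every column so rows stay aligned.
perm = np.random.permutation(len(obs))
obs, rewards = obs[perm], rewards[perm]
infos = [infos[i] for i in perm]  # lists are re-ordered element by element

# Toy time-ranked batch: 3 zero-padded sequences of max length 4.
seqs = np.arange(24).reshape(3, 4, 2)
seq_lens = np.array([4, 2, 3])
# Permute along axis 0 (the batch axis) only; the time axis is untouched.
seq_perm = np.random.permutation(len(seq_lens))
seqs, seq_lens = seqs[seq_perm], seq_lens[seq_perm]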
- # This should happen once per minibatch iteration in order to make - # each iteration go through a different set of minibatches. - if self._shuffle_batch_per_epoch: - module_batch.shuffle() - if len(module_batch) == 0: raise ValueError( f"The batch for module_id {module_id} is empty! " @@ -164,6 +158,11 @@ def get_len(b): n_steps -= len_sample s = 0 self._num_covered_epochs[module_id] += 1 + # Shuffle the individual single-agent batch, if required. + # This should happen once per minibatch iteration in order to make + # each iteration go through a different set of minibatches. + if self._shuffle_batch_per_epoch: + module_batch.shuffle() e = s + n_steps # end if e > s: From 4e1e42eb1993d98c6ac7ac3330bd859ac175a79b Mon Sep 17 00:00:00 2001 From: sven1977 Date: Tue, 3 Sep 2024 10:36:15 +0200 Subject: [PATCH 05/20] wip Signed-off-by: sven1977 --- rllib/algorithms/impala/impala.py | 45 +++++++------------ rllib/core/learner/learner.py | 31 ++++++++++--- rllib/core/learner/learner_group.py | 51 ++++++++++++++++------ rllib/tuned_examples/appo/cartpole_appo.py | 2 +- rllib/tuned_examples/ppo/cartpole_ppo.py | 13 +----- rllib/utils/minibatch_utils.py | 1 + 6 files changed, 83 insertions(+), 60 deletions(-) diff --git a/rllib/algorithms/impala/impala.py b/rllib/algorithms/impala/impala.py index ea9da381b828..a6c69876aaf9 100644 --- a/rllib/algorithms/impala/impala.py +++ b/rllib/algorithms/impala/impala.py @@ -171,7 +171,6 @@ def __init__(self, algo_class=None): self.rollout_fragment_length = 50 self.train_batch_size = 500 # @OldAPIstack self.train_batch_size_per_learner = 500 - #self._minibatch_size = "auto" self.num_env_runners = 2 self.num_gpus = 1 # @OldAPIstack self.lr = 0.0005 @@ -436,21 +435,21 @@ def validate(self) -> None: "config.training(_tf_policy_handles_more_than_one_loss=True)." ) # Learner API specific checks. - #if ( - # self.enable_rl_module_and_learner - # and self._minibatch_size != "auto" - # and not ( - # (self.minibatch_size % self.rollout_fragment_length == 0) - # and self.minibatch_size <= self.total_train_batch_size - # ) - #): - # raise ValueError( - # f"`minibatch_size` ({self._minibatch_size}) must either be 'auto' " - # "or a multiple of `rollout_fragment_length` " - # f"({self.rollout_fragment_length}) while at the same time smaller " - # "than or equal to `total_train_batch_size` " - # f"({self.total_train_batch_size})!" - # ) + if ( + self.enable_rl_module_and_learner + and self.minibatch_size is not None + and not ( + (self.minibatch_size % self.rollout_fragment_length == 0) + and self.minibatch_size <= self.total_train_batch_size + ) + ): + raise ValueError( + f"`minibatch_size` ({self._minibatch_size}) must either be None " + "or a multiple of `rollout_fragment_length` " + f"({self.rollout_fragment_length}) while at the same time smaller " + "than or equal to `total_train_batch_size` " + f"({self.total_train_batch_size})!" + ) @property def replay_ratio(self) -> float: @@ -460,20 +459,6 @@ def replay_ratio(self) -> float: """ return (1 / self.replay_proportion) if self.replay_proportion > 0 else 0.0 - #@property - #def minibatch_size(self): - # # If 'auto', use the train_batch_size (meaning each SGD iter is a single pass - # # through the entire train batch). Otherwise, use user provided setting. 
- # return ( - # ( - # self.train_batch_size_per_learner - # if self.enable_env_runner_and_connector_v2 - # else self.train_batch_size - # ) - # if self._minibatch_size == "auto" - # else self._minibatch_size - # ) - @override(AlgorithmConfig) def get_default_learner_class(self): if self.framework_str == "torch": diff --git a/rllib/core/learner/learner.py b/rllib/core/learner/learner.py index b5873ac7e826..8e8247741ebd 100644 --- a/rllib/core/learner/learner.py +++ b/rllib/core/learner/learner.py @@ -927,6 +927,7 @@ def update_from_batch( timesteps: Optional[Dict[str, Any]] = None, num_epochs: int = 1, minibatch_size: Optional[int] = None, + shuffle_batch_per_epoch: bool = True, # Deprecated args. num_iters=DEPRECATED_VALUE, ) -> ResultDict: @@ -946,7 +947,14 @@ def update_from_batch( provided). The train batch is generated from the given `episodes` through the Learner connector pipeline. minibatch_size: The size of minibatches to use to further split the train - batch into. + `batch` into sub-batches. The `batch` is then iterated over n times + where n is `len(batch) // minibatch_size`. + shuffle_batch_per_epoch: Whether to shuffle the train batch once per epoch. + If the train batch has a time rank (axis=1), shuffling will only take + place along the batch axis to not disturb any intact (episode) + trajectories. Also, shuffling is always skipped if `minibatch_size` is + None, meaning the entire train batch is processed each epoch, making it + unnecessary to shuffle. Returns: A `ResultDict` object produced by a call to `self.metrics.reduce()`. The @@ -966,6 +974,7 @@ def update_from_batch( timesteps=timesteps, num_epochs=num_epochs, minibatch_size=minibatch_size, + shuffle_batch_per_epoch=shuffle_batch_per_epoch, ) def update_from_episodes( @@ -977,6 +986,7 @@ def update_from_episodes( num_epochs: int = 1, minibatch_size: Optional[int] = None, num_total_minibatches: int = 0, + shuffle_batch_per_epoch: bool = True, # Deprecated args. num_iters=DEPRECATED_VALUE, ) -> ResultDict: @@ -996,8 +1006,16 @@ def update_from_episodes( provided). The train batch is generated from the given `episodes` through the Learner connector pipeline. minibatch_size: The size of minibatches to use to further split the train - batch into. The train batch is generated from the given `episodes` - through the Learner connector pipeline. + `batch` into sub-batches. The `batch` is then iterated over n times + where n is `len(batch) // minibatch_size`. The train batch is generated + from the given `episodes` through the Learner connector pipeline. + shuffle_batch_per_epoch: Whether to shuffle the train batch once per epoch. + If the train batch has a time rank (axis=1), shuffling will only take + place along the batch axis to not disturb any intact (episode) + trajectories. Also, shuffling is always skipped if `minibatch_size` is + None, meaning the entire train batch is processed each epoch, making it + unnecessary to shuffle. The train batch is generated from the given + `episodes` through the Learner connector pipeline. num_total_minibatches: The total number of minibatches to loop through (over all `num_epochs` epochs). 
It's only required to set this to != 0 in multi-agent + multi-GPU situations, in which the MultiAgentEpisodes @@ -1024,6 +1042,7 @@ def update_from_episodes( timesteps=timesteps, minibatch_size=minibatch_size, num_epochs=num_epochs, + shuffle_batch_per_epoch=shuffle_batch_per_epoch, num_total_minibatches=num_total_minibatches, ) @@ -1037,7 +1056,7 @@ def update_from_iterator( **kwargs, ): self._check_is_built() - #minibatch_size = minibatch_size or 32 + # minibatch_size = minibatch_size or 32 # Call `before_gradient_based_update` to allow for non-gradient based # preparations-, logging-, and update logic to happen. @@ -1294,7 +1313,9 @@ def _update_from_batch_or_episodes( if minibatch_size: if self._learner_connector is not None: - batch_iter = partial(MiniBatchCyclicIterator, _uses_new_env_runners=True) + batch_iter = partial( + MiniBatchCyclicIterator, _uses_new_env_runners=True + ) else: batch_iter = MiniBatchCyclicIterator elif num_epochs > 1: diff --git a/rllib/core/learner/learner_group.py b/rllib/core/learner/learner_group.py index d746265c9b23..48bd5628b6ef 100644 --- a/rllib/core/learner/learner_group.py +++ b/rllib/core/learner/learner_group.py @@ -222,7 +222,7 @@ def update_from_batch( async_update: bool = False, return_state: bool = False, num_epochs: int = 1, - shuffle_batch_per_epoch: bool = False, + shuffle_batch_per_epoch: bool = True, minibatch_size: Optional[int] = None, # User kwargs. **kwargs, @@ -243,9 +243,18 @@ def update_from_batch( Learner workers' states should be identical, so we use the first Learner's state here. Useful for avoiding an extra `get_weights()` call, e.g. for synchronizing EnvRunner weights. - minibatch_size: The minibatch size to use for the update. - num_iters: The number of complete passes over all the sub-batches in the - input multi-agent batch. + num_epochs: The number of complete passes over the entire train batch. Each + pass might be further split into n minibatches (if `minibatch_size` + provided). + minibatch_size: The size of minibatches to use to further split the train + `batch` into sub-batches. The `batch` is then iterated over n times + where n is `len(batch) // minibatch_size`. + shuffle_batch_per_epoch: Whether to shuffle the train batch once per epoch. + If the train batch has a time rank (axis=1), shuffling will only take + place along the batch axis to not disturb any intact (episode) + trajectories. Also, shuffling is always skipped if `minibatch_size` is + None, meaning the entire train batch is processed each epoch, making it + unnecessary to shuffle. Returns: If `async_update` is False, a dictionary with the reduced results of the @@ -262,8 +271,9 @@ def update_from_batch( timesteps=timesteps, async_update=async_update, return_state=return_state, - minibatch_size=minibatch_size, num_epochs=num_epochs, + minibatch_size=minibatch_size, + shuffle_batch_per_epoch=shuffle_batch_per_epoch, **kwargs, ) @@ -275,8 +285,8 @@ def update_from_episodes( async_update: bool = False, return_state: bool = False, num_epochs: int = 1, - shuffle_batch_per_epoch: bool = False, minibatch_size: Optional[int] = None, + shuffle_batch_per_epoch: bool = True, # User kwargs. **kwargs, ) -> Union[Dict[str, Any], List[Dict[str, Any]], List[List[Dict[str, Any]]]]: @@ -296,9 +306,21 @@ def update_from_episodes( Learner workers' states should be identical, so we use the first Learner's state here. Useful for avoiding an extra `get_weights()` call, e.g. for synchronizing EnvRunner weights. - minibatch_size: The minibatch size to use for the update. 
- num_iters: The number of complete passes over all the sub-batches in the - input multi-agent batch. + num_epochs: The number of complete passes over the entire train batch. Each + pass might be further split into n minibatches (if `minibatch_size` + provided). The train batch is generated from the given `episodes` + through the Learner connector pipeline. + minibatch_size: The size of minibatches to use to further split the train + `batch` into sub-batches. The `batch` is then iterated over n times + where n is `len(batch) // minibatch_size`. The train batch is generated + from the given `episodes` through the Learner connector pipeline. + shuffle_batch_per_epoch: Whether to shuffle the train batch once per epoch. + If the train batch has a time rank (axis=1), shuffling will only take + place along the batch axis to not disturb any intact (episode) + trajectories. Also, shuffling is always skipped if `minibatch_size` is + None, meaning the entire train batch is processed each epoch, making it + unnecessary to shuffle. The train batch is generated from the given + `episodes` through the Learner connector pipeline. Returns: If async_update is False, a dictionary with the reduced results of the @@ -315,8 +337,9 @@ def update_from_episodes( timesteps=timesteps, async_update=async_update, return_state=return_state, - minibatch_size=minibatch_size, num_epochs=num_epochs, + minibatch_size=minibatch_size, + shuffle_batch_per_epoch=shuffle_batch_per_epoch, **kwargs, ) @@ -330,7 +353,7 @@ def _update( return_state: bool = False, num_epochs: int = 1, minibatch_size: Optional[int] = None, - shuffle_batch_per_epoch: bool = False, + shuffle_batch_per_epoch: bool = True, # Deprecated args. num_iters=DEPRECATED_VALUE, **kwargs, @@ -365,16 +388,18 @@ def _learner_update( result = _learner.update_from_batch( batch=_batch_shard, timesteps=_timesteps, - minibatch_size=minibatch_size, num_epochs=num_epochs, + minibatch_size=minibatch_size, + shuffle_batch_per_epoch=shuffle_batch_per_epoch, **_kwargs, ) else: result = _learner.update_from_episodes( episodes=_episodes_shard, timesteps=_timesteps, - minibatch_size=minibatch_size, num_epochs=num_epochs, + minibatch_size=minibatch_size, + shuffle_batch_per_epoch=shuffle_batch_per_epoch, num_total_minibatches=_num_total_minibatches, **_kwargs, ) diff --git a/rllib/tuned_examples/appo/cartpole_appo.py b/rllib/tuned_examples/appo/cartpole_appo.py index 865c4ce85c31..e8ffd6cff4f9 100644 --- a/rllib/tuned_examples/appo/cartpole_appo.py +++ b/rllib/tuned_examples/appo/cartpole_appo.py @@ -35,7 +35,7 @@ .rl_module( model_config_dict={ "fcnet_hiddens": [32], - #"vf_share_layers": True, + # "vf_share_layers": True, "uses_new_env_runners": True, }, ) diff --git a/rllib/tuned_examples/ppo/cartpole_ppo.py b/rllib/tuned_examples/ppo/cartpole_ppo.py index 27d931db04c6..18f11b9d8ffa 100644 --- a/rllib/tuned_examples/ppo/cartpole_ppo.py +++ b/rllib/tuned_examples/ppo/cartpole_ppo.py @@ -1,9 +1,7 @@ from ray.rllib.algorithms.ppo import PPOConfig from ray.rllib.utils.test_utils import add_rllib_example_script_args -parser = add_rllib_example_script_args( - default_reward=450.0, default_timesteps=300000 -) +parser = add_rllib_example_script_args(default_reward=450.0, default_timesteps=300000) parser.set_defaults(enable_new_api_stack=True) # Use `parser` to add your own custom command line options to this script # and (if needed) use their values toset up `config` below. 
@@ -14,7 +12,7 @@ .environment("CartPole-v1") .training( lr=0.0003, - num_sgd_iter=6, + num_epochs=6, vf_loss_coeff=0.01, ) .rl_module( @@ -24,13 +22,6 @@ "vf_share_layers": True, } ) - .training( - gamma=0.99, - lr=0.0003, - num_epochs=6, - vf_loss_coeff=0.01, - use_kl_loss=True, - ) ) diff --git a/rllib/utils/minibatch_utils.py b/rllib/utils/minibatch_utils.py index cad11ddac9aa..f07b7f23e64f 100644 --- a/rllib/utils/minibatch_utils.py +++ b/rllib/utils/minibatch_utils.py @@ -51,6 +51,7 @@ class MiniBatchCyclicIterator(MiniBatchIteratorBase): input batch is smaller than minibatch_size, then the iterator will cycle through the batch until it has covered `num_epochs` epochs. """ + def __init__( self, batch: MultiAgentBatch, From 61c3f2080adace4a51ccaf24188d3a3baf2afe7c Mon Sep 17 00:00:00 2001 From: sven1977 Date: Tue, 3 Sep 2024 11:23:18 +0200 Subject: [PATCH 06/20] wip Signed-off-by: sven1977 --- rllib/algorithms/appo/appo.py | 5 +++- rllib/algorithms/impala/impala_learner.py | 27 +--------------------- rllib/algorithms/ppo/ppo.py | 9 ++++---- rllib/core/learner/learner.py | 6 ++--- rllib/core/learner/learner_group.py | 6 ++--- rllib/tuned_examples/appo/cartpole_appo.py | 14 ++--------- rllib/tuned_examples/ppo/pendulum_ppo.py | 5 ---- 7 files changed, 18 insertions(+), 54 deletions(-) diff --git a/rllib/algorithms/appo/appo.py b/rllib/algorithms/appo/appo.py index d2db78febbce..ad21a9780dc3 100644 --- a/rllib/algorithms/appo/appo.py +++ b/rllib/algorithms/appo/appo.py @@ -98,7 +98,10 @@ def __init__(self, algo_class=None): self.use_kl_loss = False self.kl_coeff = 1.0 self.kl_target = 0.01 - #self.shuffle_batch_per_epoch = True + # TODO (sven): Activate once v-trace sequences in non-RNN batch are solved. + # If we switch this on right now, the shuffling would destroy the rollout + # sequences (non-zero-padded!) needed in the batch for v-trace. + # self.shuffle_batch_per_epoch = True # Override some of IMPALAConfig's default values with APPO-specific values. self.num_env_runners = 2 diff --git a/rllib/algorithms/impala/impala_learner.py b/rllib/algorithms/impala/impala_learner.py index a6caea1b0fe7..6c40c79af17f 100644 --- a/rllib/algorithms/impala/impala_learner.py +++ b/rllib/algorithms/impala/impala_learner.py @@ -11,8 +11,6 @@ from ray.rllib.algorithms.impala.impala import LEARNER_RESULTS_CURR_ENTROPY_COEFF_KEY from ray.rllib.core.columns import Columns from ray.rllib.core.learner.learner import Learner -from ray.rllib.connectors.common import AddStatesFromEpisodesToBatch -from ray.rllib.connectors.connector_v2 import ConnectorV2 from ray.rllib.connectors.learner import AddOneTsToEpisodesAndTruncate from ray.rllib.policy.sample_batch import MultiAgentBatch, SampleBatch from ray.rllib.utils.annotations import ( @@ -70,10 +68,6 @@ def build(self) -> None: and self.config.add_default_connectors_to_learner_pipeline ): self._learner_connector.prepend(AddOneTsToEpisodesAndTruncate()) - self._learner_connector.insert_after( - AddStatesFromEpisodesToBatch, - AddVTraceSeqLensNoRNN, - ) # Create and start the GPU-loader thread. It picks up train-ready batches from # the "GPU-loader queue" and loads them to the GPU, then places the GPU batches @@ -118,6 +112,7 @@ def update_from_episodes( # algos that actually need (and know how) to do minibatching. minibatch_size: Optional[int] = None, num_epochs: int = 1, + shuffle_batch_per_epoch: bool = False, num_total_minibatches: int = 0, reduce_fn=None, # Deprecated args. 
**kwargs, @@ -296,23 +291,3 @@ def step(self): self._out_queue.put(copy.deepcopy(results)) self.metrics.log_value(QUEUE_SIZE_RESULTS_QUEUE, self._out_queue.qsize()) - - -class AddVTraceSeqLensNoRNN(ConnectorV2): - def __init__( - self, - input_observation_space=None, - input_action_space=None, - *, - rollout_fragment_length: int, - **kwargs, - ): - super().__init__(input_observation_space, input_action_space, **kwargs) - self._rollout_fragment_length = rollout_fragment_length - - @override(ConnectorV2) - def __call__(self, *, rl_module, batch, episodes): - if Columns.SEQ_LENS not in batch: - pass - TODO # Continue implementing here - return batch diff --git a/rllib/algorithms/ppo/ppo.py b/rllib/algorithms/ppo/ppo.py index 40efe71a5b9f..558983875df6 100644 --- a/rllib/algorithms/ppo/ppo.py +++ b/rllib/algorithms/ppo/ppo.py @@ -130,17 +130,17 @@ def __init__(self, algo_class=None): self.lr = 5e-5 self.rollout_fragment_length = "auto" self.train_batch_size = 4000 - self.shuffle_batch_per_epoch = True # PPO specific settings: self.use_critic = True self.use_gae = True + self.num_epochs = 30 + self.minibatch_size = 128 + self.shuffle_batch_per_epoch = True self.lambda_ = 1.0 self.use_kl_loss = True self.kl_coeff = 0.2 self.kl_target = 0.01 - self.minibatch_size = 128 - self.num_epochs = 30 self.vf_loss_coeff = 1.0 self.entropy_coeff = 0.0 self.entropy_coeff_schedule = None @@ -467,8 +467,9 @@ def _training_step_new_api_stack(self) -> ResultDict: self.metrics.peek(NUM_ENV_STEPS_SAMPLED_LIFETIME) ), }, - minibatch_size=self.config.minibatch_size, num_epochs=self.config.num_epochs, + minibatch_size=self.config.minibatch_size, + shuffle_batch_per_epoch=self.config.shuffle_batch_per_epoch, ) self.metrics.merge_and_log_n_dicts(learner_results, key=LEARNER_RESULTS) self.metrics.log_dict( diff --git a/rllib/core/learner/learner.py b/rllib/core/learner/learner.py index 8e8247741ebd..c9e6aed75637 100644 --- a/rllib/core/learner/learner.py +++ b/rllib/core/learner/learner.py @@ -927,7 +927,7 @@ def update_from_batch( timesteps: Optional[Dict[str, Any]] = None, num_epochs: int = 1, minibatch_size: Optional[int] = None, - shuffle_batch_per_epoch: bool = True, + shuffle_batch_per_epoch: bool = False, # Deprecated args. num_iters=DEPRECATED_VALUE, ) -> ResultDict: @@ -986,7 +986,7 @@ def update_from_episodes( num_epochs: int = 1, minibatch_size: Optional[int] = None, num_total_minibatches: int = 0, - shuffle_batch_per_epoch: bool = True, + shuffle_batch_per_epoch: bool = False, # Deprecated args. num_iters=DEPRECATED_VALUE, ) -> ResultDict: @@ -1245,7 +1245,7 @@ def _update_from_batch_or_episodes( # algos that actually need (and know how) to do minibatching. minibatch_size: Optional[int] = None, num_epochs: int = 1, - shuffle_batch_per_epoch: bool = True, + shuffle_batch_per_epoch: bool = False, num_total_minibatches: int = 0, ) -> Union[Dict[str, Any], List[Dict[str, Any]]]: diff --git a/rllib/core/learner/learner_group.py b/rllib/core/learner/learner_group.py index 48bd5628b6ef..07492a8b5611 100644 --- a/rllib/core/learner/learner_group.py +++ b/rllib/core/learner/learner_group.py @@ -222,7 +222,7 @@ def update_from_batch( async_update: bool = False, return_state: bool = False, num_epochs: int = 1, - shuffle_batch_per_epoch: bool = True, + shuffle_batch_per_epoch: bool = False, minibatch_size: Optional[int] = None, # User kwargs. 
**kwargs, @@ -286,7 +286,7 @@ def update_from_episodes( return_state: bool = False, num_epochs: int = 1, minibatch_size: Optional[int] = None, - shuffle_batch_per_epoch: bool = True, + shuffle_batch_per_epoch: bool = False, # User kwargs. **kwargs, ) -> Union[Dict[str, Any], List[Dict[str, Any]], List[List[Dict[str, Any]]]]: @@ -353,7 +353,7 @@ def _update( return_state: bool = False, num_epochs: int = 1, minibatch_size: Optional[int] = None, - shuffle_batch_per_epoch: bool = True, + shuffle_batch_per_epoch: bool = False, # Deprecated args. num_iters=DEPRECATED_VALUE, **kwargs, diff --git a/rllib/tuned_examples/appo/cartpole_appo.py b/rllib/tuned_examples/appo/cartpole_appo.py index e8ffd6cff4f9..6a2ccf143464 100644 --- a/rllib/tuned_examples/appo/cartpole_appo.py +++ b/rllib/tuned_examples/appo/cartpole_appo.py @@ -1,9 +1,4 @@ from ray.rllib.algorithms.appo import APPOConfig -from ray.rllib.utils.metrics import ( - ENV_RUNNER_RESULTS, - EPISODE_RETURN_MEAN, - NUM_ENV_STEPS_SAMPLED_LIFETIME, -) from ray.rllib.utils.test_utils import add_rllib_example_script_args parser = add_rllib_example_script_args( @@ -25,17 +20,12 @@ ) .environment("CartPole-v1") .training( - train_batch_size_per_learner=1000, vf_loss_coeff=0.05, - entropy_coeff=0.01, - num_epochs=2, - lr=0.00075, - minibatch_size=250, + entropy_coeff=0.0, ) .rl_module( model_config_dict={ - "fcnet_hiddens": [32], - # "vf_share_layers": True, + "vf_share_layers": True, "uses_new_env_runners": True, }, ) diff --git a/rllib/tuned_examples/ppo/pendulum_ppo.py b/rllib/tuned_examples/ppo/pendulum_ppo.py index aa0c5d1027b5..5df6e3e78855 100644 --- a/rllib/tuned_examples/ppo/pendulum_ppo.py +++ b/rllib/tuned_examples/ppo/pendulum_ppo.py @@ -9,11 +9,6 @@ config = ( PPOConfig() - # Enable new API stack and use EnvRunner. - .api_stack( - enable_rl_module_and_learner=True, - enable_env_runner_and_connector_v2=True, - ) .env_runners( num_env_runners=2, num_envs_per_env_runner=10, From a20f44c937c7631f93874b152167cbc3078431df Mon Sep 17 00:00:00 2001 From: sven1977 Date: Tue, 3 Sep 2024 13:02:30 +0200 Subject: [PATCH 07/20] fix Signed-off-by: sven1977 --- rllib/policy/torch_policy.py | 10 +++++++--- rllib/policy/torch_policy_v2.py | 11 +++++++---- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/rllib/policy/torch_policy.py b/rllib/policy/torch_policy.py index e4db6d37a5c0..5abd0c9922f8 100644 --- a/rllib/policy/torch_policy.py +++ b/rllib/policy/torch_policy.py @@ -548,9 +548,13 @@ def learn_on_loaded_batch(self, offset: int = 0, buffer_index: int = 0): # Get the correct slice of the already loaded batch to use, # based on offset and batch size. - device_batch_size = self.config.get( - "minibatch_size", self.config["train_batch_size"] - ) // len(self.devices) + device_batch_size = self.config.get("minibatch_size") + if device_batch_size is None: + device_batch_size = self.config.get( + "sgd_minibatch_size", + self.config["train_batch_size"], + ) + device_batch_size //= len(self.devices) # Set Model to train mode. if self.model_gpu_towers: diff --git a/rllib/policy/torch_policy_v2.py b/rllib/policy/torch_policy_v2.py index 649fc19f88e3..a61116fb712c 100644 --- a/rllib/policy/torch_policy_v2.py +++ b/rllib/policy/torch_policy_v2.py @@ -838,10 +838,13 @@ def learn_on_loaded_batch(self, offset: int = 0, buffer_index: int = 0): # Get the correct slice of the already loaded batch to use, # based on offset and batch size. 
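The same lookup order is applied in several policies below: prefer the new `minibatch_size`, fall back to the old `sgd_minibatch_size`, then to `train_batch_size`, and only then divide by the number of devices. A small sketch of that lookup, with a plain dict standing in for the policy config:

def get_device_batch_size(config, num_devices):
    """Resolve the per-device batch size with old/new key fallbacks."""
    batch_size = config.get("minibatch_size")
    if batch_size is None:
        # Old API stack name first, then the full train batch as a last resort.
        batch_size = config.get("sgd_minibatch_size", config["train_batch_size"])
    return batch_size // num_devices

print(get_device_batch_size({"minibatch_size": 128, "train_batch_size": 4000}, 2))   # 64
print(get_device_batch_size({"minibatch_size": None, "sgd_minibatch_size": 256,
                             "train_batch_size": 4000}, 2))                          # 128
print(get_device_batch_size({"minibatch_size": None, "train_batch_size": 4000}, 2))  # 2000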
- device_batch_size = self.config.get( - "minibatch_size", - self.config.get("sgd_minibatch_size", self.config["train_batch_size"]), - ) // len(self.devices) + device_batch_size = self.config.get("minibatch_size") + if device_batch_size is None: + device_batch_size = self.config.get( + "sgd_minibatch_size", + self.config["train_batch_size"], + ) + device_batch_size //= len(self.devices) # Set Model to train mode. if self.model_gpu_towers: From b966d998dd6f0e80f4c019a3c6ded43bf4f8a7b1 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Tue, 3 Sep 2024 14:20:22 +0200 Subject: [PATCH 08/20] fix Signed-off-by: sven1977 --- rllib/algorithms/algorithm_config.py | 12 ++++++------ .../common/add_states_from_episodes_to_batch.py | 9 +++++++++ rllib/core/learner/learner.py | 14 +++++++++----- rllib/core/learner/learner_group.py | 2 +- rllib/utils/minibatch_utils.py | 2 +- 5 files changed, 26 insertions(+), 13 deletions(-) diff --git a/rllib/algorithms/algorithm_config.py b/rllib/algorithms/algorithm_config.py index 233469a5c429..6eb78b4a6532 100644 --- a/rllib/algorithms/algorithm_config.py +++ b/rllib/algorithms/algorithm_config.py @@ -386,8 +386,8 @@ def __init__(self, algo_class: Optional[type] = None): # These setting have been adopted from the original PPO batch settings: # num_sgd_iter, minibatch_size, and shuffle_sequences. self.num_epochs = 1 - self.shuffle_batch_per_epoch = False self.minibatch_size = None + self.shuffle_batch_per_epoch = False # TODO (sven): Unsolved problem with RLModules sometimes requiring settings from # the main AlgorithmConfig. We should not require the user to provide those @@ -2053,8 +2053,8 @@ def training( train_batch_size: Optional[int] = NotProvided, train_batch_size_per_learner: Optional[int] = NotProvided, num_epochs: Optional[int] = NotProvided, - shuffle_batch_per_epoch: Optional[bool] = NotProvided, minibatch_size: Optional[int] = NotProvided, + shuffle_batch_per_epoch: Optional[bool] = NotProvided, model: Optional[dict] = NotProvided, optimizer: Optional[dict] = NotProvided, max_requests_in_flight_per_sampler_worker: Optional[int] = NotProvided, @@ -2116,12 +2116,12 @@ def training( num_epochs: The number of complete passes over the entire train batch (per Learner). Each pass might be further split into n minibatches (if `minibatch_size` provided). + minibatch_size: The size of minibatches to use to further split the train + batch into. shuffle_batch_per_epoch: Whether to shuffle the train batch once per epoch. If the train batch has a time rank (axis=1), shuffling will only take place along the batch axis to not disturb any intact (episode) trajectories. - minibatch_size: The size of minibatches to use to further split the train - batch into. model: Arguments passed into the policy model. See models/catalog.py for a full list of the available model options. TODO: Provide ModelConfig objects instead of dicts. 
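With these settings promoted to `AlgorithmConfig`, a PPO setup using the renamed arguments would look roughly like the following; the concrete numbers are illustrative only, and the comments name the old keywords being replaced:

from ray.rllib.algorithms.ppo import PPOConfig

config = (
    PPOConfig()
    .environment("CartPole-v1")
    .training(
        train_batch_size=4000,
        minibatch_size=128,            # was: sgd_minibatch_size
        num_epochs=30,                 # was: num_sgd_iter
        shuffle_batch_per_epoch=True,  # was: shuffle_sequences
        lr=0.0003,
    )
)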
@@ -2187,10 +2187,10 @@ def training( self.train_batch_size = train_batch_size if num_epochs is not NotProvided: self.num_epochs = num_epochs - if shuffle_batch_per_epoch is not NotProvided: - self.shuffle_batch_per_epoch = shuffle_batch_per_epoch if minibatch_size is not NotProvided: self.minibatch_size = minibatch_size + if shuffle_batch_per_epoch is not NotProvided: + self.shuffle_batch_per_epoch = shuffle_batch_per_epoch if model is not NotProvided: self.model.update(model) diff --git a/rllib/connectors/common/add_states_from_episodes_to_batch.py b/rllib/connectors/common/add_states_from_episodes_to_batch.py index 2c62466d84ab..e4e5bfa2641a 100644 --- a/rllib/connectors/common/add_states_from_episodes_to_batch.py +++ b/rllib/connectors/common/add_states_from_episodes_to_batch.py @@ -266,6 +266,15 @@ def __call__( item_list, max_seq_len=self._get_max_seq_len(rl_module, module_id=mid), ) + # TODO (sven): Remove this hint/hack once we are not relying on + # SampleBatch anymore (which has to set its property + # zero_padded=True when shuffling). + shared_data[ + ( + "_zero_padded_for_mid=" + f"{mid if mid is not None else DEFAULT_MODULE_ID}" + ) + ] = True for sa_episode in self.single_agent_episode_iterator( episodes, diff --git a/rllib/core/learner/learner.py b/rllib/core/learner/learner.py index c9e6aed75637..db3bfaa1eab8 100644 --- a/rllib/core/learner/learner.py +++ b/rllib/core/learner/learner.py @@ -985,8 +985,8 @@ def update_from_episodes( timesteps: Optional[Dict[str, Any]] = None, num_epochs: int = 1, minibatch_size: Optional[int] = None, - num_total_minibatches: int = 0, shuffle_batch_per_epoch: bool = False, + num_total_minibatches: int = 0, # Deprecated args. num_iters=DEPRECATED_VALUE, ) -> ResultDict: @@ -1040,8 +1040,8 @@ def update_from_episodes( return self._update_from_batch_or_episodes( episodes=episodes, timesteps=timesteps, - minibatch_size=minibatch_size, num_epochs=num_epochs, + minibatch_size=minibatch_size, shuffle_batch_per_epoch=shuffle_batch_per_epoch, num_total_minibatches=num_total_minibatches, ) @@ -1243,8 +1243,8 @@ def _update_from_batch_or_episodes( timesteps: Optional[Dict[str, Any]] = None, # TODO (sven): Deprecate these in favor of config attributes for only those # algos that actually need (and know how) to do minibatching. - minibatch_size: Optional[int] = None, num_epochs: int = 1, + minibatch_size: Optional[int] = None, shuffle_batch_per_epoch: bool = False, num_total_minibatches: int = 0, ) -> Union[Dict[str, Any], List[Dict[str, Any]]]: @@ -1280,7 +1280,11 @@ def _update_from_batch_or_episodes( # TODO (sven): Try to not require MultiAgentBatch anymore. 
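The zero-padding information travels from the connector to the Learner through the `shared_data` dict under a per-module key. A toy sketch of that handshake with a plain dict; the module ids are made up and `DEFAULT_MODULE_ID` is a stand-in for RLlib's constant:

DEFAULT_MODULE_ID = "default_policy"  # stand-in, for illustration only

shared_data = {}

# Connector side: record that a module's batch was zero-padded.
for mid in [None, "pol1"]:
    key = f"_zero_padded_for_mid={mid if mid is not None else DEFAULT_MODULE_ID}"
    shared_data[key] = True

# Learner side: decide per module whether to mark its SampleBatch as padded.
for module_id in ["default_policy", "pol1", "pol2"]:
    zero_padded = shared_data.get(f"_zero_padded_for_mid={module_id}", False)
    print(module_id, zero_padded)  # pol2 -> False (was never padded)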
batch = MultiAgentBatch( { - module_id: SampleBatch(module_data) + module_id: ( + SampleBatch(module_data, _zero_padded=True) + if shared_data.get(f"_zero_padded_for_mid={module_id}") + else SampleBatch(module_data) + ) for module_id, module_data in batch.items() }, env_steps=sum(len(e) for e in episodes), @@ -1340,8 +1344,8 @@ def _update_from_batch_or_episodes( for tensor_minibatch in batch_iter( batch, - minibatch_size=minibatch_size, num_epochs=num_epochs, + minibatch_size=minibatch_size, shuffle_batch_per_epoch=shuffle_batch_per_epoch and (num_epochs > 1), num_total_minibatches=num_total_minibatches, ): diff --git a/rllib/core/learner/learner_group.py b/rllib/core/learner/learner_group.py index 07492a8b5611..eb8d2a3cb05d 100644 --- a/rllib/core/learner/learner_group.py +++ b/rllib/core/learner/learner_group.py @@ -222,8 +222,8 @@ def update_from_batch( async_update: bool = False, return_state: bool = False, num_epochs: int = 1, - shuffle_batch_per_epoch: bool = False, minibatch_size: Optional[int] = None, + shuffle_batch_per_epoch: bool = False, # User kwargs. **kwargs, ) -> Union[Dict[str, Any], List[Dict[str, Any]], List[List[Dict[str, Any]]]]: diff --git a/rllib/utils/minibatch_utils.py b/rllib/utils/minibatch_utils.py index f07b7f23e64f..e27b5a7782ba 100644 --- a/rllib/utils/minibatch_utils.py +++ b/rllib/utils/minibatch_utils.py @@ -57,8 +57,8 @@ def __init__( batch: MultiAgentBatch, *, num_epochs: int = 1, - shuffle_batch_per_epoch: bool = True, minibatch_size: int, + shuffle_batch_per_epoch: bool = True, num_total_minibatches: int = 0, _uses_new_env_runners: bool = False, ) -> None: From 42535d49eba34217a7cd846a4c8e529e601fddba Mon Sep 17 00:00:00 2001 From: sven1977 Date: Wed, 4 Sep 2024 10:47:14 +0200 Subject: [PATCH 09/20] fix Signed-off-by: sven1977 --- rllib/execution/train_ops.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/rllib/execution/train_ops.py b/rllib/execution/train_ops.py index bf930a00f5e2..5c207ced3a27 100644 --- a/rllib/execution/train_ops.py +++ b/rllib/execution/train_ops.py @@ -46,7 +46,9 @@ def train_one_step(algorithm, train_batch, policies_to_train=None) -> Dict: workers = algorithm.env_runner_group local_worker = workers.local_env_runner num_sgd_iter = config.get("num_epochs", config.get("num_sgd_iter", 1)) - minibatch_size = config.get("minibatch_size", config.get("sgd_minibatch_size", 0)) + minibatch_size = config.get("minibatch_size") + if minibatch_size is None: + minibatch_size = config.get("sgd_minibatch_size", 0) learn_timer = algorithm._timers[LEARN_ON_BATCH_TIMER] with learn_timer: From 292c71fda65e1ac970c4574dc3717288b1355b75 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Wed, 4 Sep 2024 11:44:40 +0200 Subject: [PATCH 10/20] fix Signed-off-by: sven1977 --- rllib/execution/train_ops.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/rllib/execution/train_ops.py b/rllib/execution/train_ops.py index 5c207ced3a27..2b2b76bc671e 100644 --- a/rllib/execution/train_ops.py +++ b/rllib/execution/train_ops.py @@ -117,7 +117,9 @@ def multi_gpu_train_one_step(algorithm, train_batch) -> Dict: workers = algorithm.env_runner_group local_worker = workers.local_env_runner num_sgd_iter = config.get("num_epochs", config.get("num_sgd_iter", 1)) - minibatch_size = config.get("minibatch_size", config["train_batch_size"]) + minibatch_size = config.get("minibatch_size") + if minibatch_size is None: + minibatch_size = config["train_batch_size"] # Determine the number of devices (GPUs or 1 CPU) we use. 
num_devices = int(math.ceil(config["num_gpus"] or 1)) From 1f748f10c36ef6470bb611f4754c9277d719103d Mon Sep 17 00:00:00 2001 From: sven1977 Date: Wed, 4 Sep 2024 12:51:21 +0200 Subject: [PATCH 11/20] fix Signed-off-by: sven1977 --- rllib/utils/tests/test_minibatch_utils.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/rllib/utils/tests/test_minibatch_utils.py b/rllib/utils/tests/test_minibatch_utils.py index 879e8b522a1b..0d6b53d060be 100644 --- a/rllib/utils/tests/test_minibatch_utils.py +++ b/rllib/utils/tests/test_minibatch_utils.py @@ -72,7 +72,8 @@ def test_minibatch_cyclic_iterator(self): ] ), "seq_lens": seq_lens, - } + }, + _zero_padded=padding, ) for i in range(len(agent_steps)) } @@ -86,7 +87,10 @@ def test_minibatch_cyclic_iterator(self): mb = MultiAgentBatch(sample_batches, num_env_steps) batch_iter = MiniBatchCyclicIterator( - mb, minibatch_size=minibatch_size, num_epochs=num_epochs + mb, + minibatch_size=minibatch_size, + num_epochs=num_epochs, + shuffle_batch_per_epoch=False, ) print(config) iteration_counter = 0 From c13647a6ec041a6dd6a10fd017295128c9682365 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Wed, 4 Sep 2024 13:16:03 +0200 Subject: [PATCH 12/20] fix Signed-off-by: sven1977 --- rllib/algorithms/appo/appo.py | 11 ++++------- rllib/algorithms/cql/cql.py | 8 ++++---- rllib/algorithms/marwil/marwil.py | 16 ++++++++-------- rllib/core/learner/learner.py | 9 +++++++-- rllib/core/learner/learner_group.py | 14 +++----------- 5 files changed, 26 insertions(+), 32 deletions(-) diff --git a/rllib/algorithms/appo/appo.py b/rllib/algorithms/appo/appo.py index ad21a9780dc3..22d2edfd2860 100644 --- a/rllib/algorithms/appo/appo.py +++ b/rllib/algorithms/appo/appo.py @@ -188,13 +188,10 @@ def training( target_network_update_freq: The frequency to update the target policy and tune the kl loss coefficients that are used during training. After setting this parameter, the algorithm waits for at least - `target_network_update_freq * minibatch_size * num_epochs` number of - samples to be trained on by the learner group before updating the target - networks and tuned the kl loss coefficients that are used during - training. - NOTE: This parameter is only applicable when using the Learner API - (enable_rl_module_and_learner=True). - + `target_network_update_freq` number of environment samples to be trained + on before updating the target networks and tune the kl loss + coefficients. NOTE: This parameter is only applicable when using the + Learner API (enable_rl_module_and_learner=True). Returns: This updated AlgorithmConfig object. diff --git a/rllib/algorithms/cql/cql.py b/rllib/algorithms/cql/cql.py index cf6cb5f7e041..79e94ccf75a4 100644 --- a/rllib/algorithms/cql/cql.py +++ b/rllib/algorithms/cql/cql.py @@ -306,7 +306,7 @@ def _training_step_new_api_stack(self) -> ResultDict: # Sampling from offline data. with self.metrics.log_time((TIMERS, OFFLINE_SAMPLING_TIMER)): # Return an iterator in case we are using remote learners. - batch = self.offline_data.sample( + batch_or_iterator = self.offline_data.sample( num_samples=self.config.train_batch_size_per_learner, num_shards=self.config.num_learners, return_iterator=self.config.num_learners > 1, @@ -315,9 +315,9 @@ def _training_step_new_api_stack(self) -> ResultDict: # Updating the policy. with self.metrics.log_time((TIMERS, LEARNER_UPDATE_TIMER)): # TODO (simon, sven): Check, if we should execute directly s.th. like - # update_from_iterator. 
- learner_results = self.learner_group.update_from_batch( - batch, + # `LearnerGroup.update_from_iterator()`. + learner_results = self.learner_group._update( + batch=batch_or_iterator, minibatch_size=self.config.train_batch_size_per_learner, num_iters=self.config.dataset_num_iters_per_learner, ) diff --git a/rllib/algorithms/marwil/marwil.py b/rllib/algorithms/marwil/marwil.py index 7dbe8c85566f..d73e074fdff9 100644 --- a/rllib/algorithms/marwil/marwil.py +++ b/rllib/algorithms/marwil/marwil.py @@ -380,12 +380,12 @@ class (multi-/single-learner setup) and evaluation on """ # Implement logic using RLModule and Learner API. # TODO (simon): Take care of sampler metrics: right - # now all rewards are `nan`, which possibly confuses - # the user that sth. is not right, although it is as - # we do not step the env. + # now all rewards are `nan`, which possibly confuses + # the user that sth. is not right, although it is as + # we do not step the env. with self.metrics.log_time((TIMERS, OFFLINE_SAMPLING_TIMER)): # Sampling from offline data. - batch = self.offline_data.sample( + batch_or_iterator = self.offline_data.sample( num_samples=self.config.train_batch_size_per_learner, num_shards=self.config.num_learners, return_iterator=self.config.num_learners > 1, @@ -394,11 +394,11 @@ class (multi-/single-learner setup) and evaluation on with self.metrics.log_time((TIMERS, LEARNER_UPDATE_TIMER)): # Updating the policy. # TODO (simon, sven): Check, if we should execute directly s.th. like - # update_from_iterator. - learner_results = self.learner_group.update_from_batch( - batch, + # `LearnerGroup.update_from_iterator()`. + learner_results = self.learner_group._update( + batch=batch_or_iterator, minibatch_size=self.config.train_batch_size_per_learner, - num_epochs=self.config.dataset_num_iters_per_learner, + num_iters=self.config.dataset_num_iters_per_learner, ) # Log training results. diff --git a/rllib/core/learner/learner.py b/rllib/core/learner/learner.py index db3bfaa1eab8..7e7ca16dbfa2 100644 --- a/rllib/core/learner/learner.py +++ b/rllib/core/learner/learner.py @@ -944,8 +944,7 @@ def update_from_batch( # TODO (sven): Make this a more formal structure with its own type. num_epochs: The number of complete passes over the entire train batch. Each pass might be further split into n minibatches (if `minibatch_size` - provided). The train batch is generated from the given `episodes` - through the Learner connector pipeline. + provided). minibatch_size: The size of minibatches to use to further split the train `batch` into sub-batches. The `batch` is then iterated over n times where n is `len(batch) // minibatch_size`. @@ -1055,6 +1054,12 @@ def update_from_iterator( num_iters: int = None, **kwargs, ): + if "num_epochs" in kwargs: + raise ValueError( + "`num_epochs` arg NOT supported by Learner.update_from_iterator! Use " + "`num_iters` instead." 
+ ) + self._check_is_built() # minibatch_size = minibatch_size or 32 diff --git a/rllib/core/learner/learner_group.py b/rllib/core/learner/learner_group.py index eb8d2a3cb05d..fe4aa9cfd09c 100644 --- a/rllib/core/learner/learner_group.py +++ b/rllib/core/learner/learner_group.py @@ -35,11 +35,7 @@ ) from ray.rllib.utils.annotations import override from ray.rllib.utils.checkpoints import Checkpointable -from ray.rllib.utils.deprecation import ( - Deprecated, - DEPRECATED_VALUE, - deprecation_warning, -) +from ray.rllib.utils.deprecation import Deprecated from ray.rllib.utils.metrics.metrics_logger import MetricsLogger from ray.rllib.utils.minibatch_utils import ( ShardBatchIterator, @@ -352,16 +348,12 @@ def _update( async_update: bool = False, return_state: bool = False, num_epochs: int = 1, + num_iters: int = 1, minibatch_size: Optional[int] = None, shuffle_batch_per_epoch: bool = False, - # Deprecated args. - num_iters=DEPRECATED_VALUE, **kwargs, ) -> Union[Dict[str, Any], List[Dict[str, Any]], List[List[Dict[str, Any]]]]: - if num_iters != DEPRECATED_VALUE: - deprecation_warning(old="num_iters", new="num_epochs", error=True) - # Define function to be called on all Learner actors (or the local learner). def _learner_update( _learner: Learner, @@ -381,7 +373,7 @@ def _learner_update( iterator=_batch_shard, timesteps=_timesteps, minibatch_size=minibatch_size, - num_epochs=num_epochs, + num_iters=num_iters, **_kwargs, ) elif _batch_shard is not None: From cd3869512063e34c848801ea4a2b3cfad6173a9f Mon Sep 17 00:00:00 2001 From: sven1977 Date: Wed, 4 Sep 2024 14:26:32 +0200 Subject: [PATCH 13/20] fixes Signed-off-by: sven1977 --- rllib/algorithms/algorithm_config.py | 10 ++++++++++ rllib/algorithms/tests/test_algorithm_config.py | 2 +- ...te_modelv2_to_new_api_stack_by_policy_checkpoint.py | 2 +- 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/rllib/algorithms/algorithm_config.py b/rllib/algorithms/algorithm_config.py index cebdd95e0006..1044757b7290 100644 --- a/rllib/algorithms/algorithm_config.py +++ b/rllib/algorithms/algorithm_config.py @@ -2083,6 +2083,8 @@ def training( ] = NotProvided, add_default_connectors_to_learner_pipeline: Optional[bool] = NotProvided, learner_config_dict: Optional[Dict[str, Any]] = NotProvided, + # Deprecated args. + num_sgd_iter=DEPRECATED_VALUE, ) -> "AlgorithmConfig": """Sets the training related configuration. @@ -2187,6 +2189,14 @@ def training( Returns: This updated AlgorithmConfig object. 
""" + if num_sgd_iter is not NotProvided: + deprecation_warning( + old="config.training(num_sgd_iter=..)", + new="config.training(num_epochs=..)", + error=False, + ) + num_epochs = num_sgd_iter + if gamma is not NotProvided: self.gamma = gamma if lr is not NotProvided: diff --git a/rllib/algorithms/tests/test_algorithm_config.py b/rllib/algorithms/tests/test_algorithm_config.py index 03ec44a9aad9..9f81bd7abd9d 100644 --- a/rllib/algorithms/tests/test_algorithm_config.py +++ b/rllib/algorithms/tests/test_algorithm_config.py @@ -30,7 +30,7 @@ def test_running_specific_algo_with_generic_config(self): config = ( AlgorithmConfig(algo_class=PPO) .environment("CartPole-v0") - .training(lr=0.12345, train_batch_size=3000) + .training(lr=0.12345, train_batch_size=3000, minibatch_size=300) ) algo = config.build() self.assertTrue(algo.config.lr == 0.12345) diff --git a/rllib/examples/rl_modules/migrate_modelv2_to_new_api_stack_by_policy_checkpoint.py b/rllib/examples/rl_modules/migrate_modelv2_to_new_api_stack_by_policy_checkpoint.py index 5de20eee0f52..d67195f86a64 100644 --- a/rllib/examples/rl_modules/migrate_modelv2_to_new_api_stack_by_policy_checkpoint.py +++ b/rllib/examples/rl_modules/migrate_modelv2_to_new_api_stack_by_policy_checkpoint.py @@ -26,7 +26,7 @@ .environment("CartPole-v1") .training( lr=0.0003, - num_sgd_iter=6, + num_epochs=6, vf_loss_coeff=0.01, ) ) From 4f36d7af366e4545ffddc81abfeac2dc63a6a69b Mon Sep 17 00:00:00 2001 From: sven1977 Date: Wed, 4 Sep 2024 14:35:37 +0200 Subject: [PATCH 14/20] fix Signed-off-by: sven1977 --- rllib/policy/dynamic_tf_policy.py | 9 +++++---- rllib/policy/dynamic_tf_policy_v2.py | 9 +++++---- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/rllib/policy/dynamic_tf_policy.py b/rllib/policy/dynamic_tf_policy.py index efd7b4024131..ac40205de94a 100644 --- a/rllib/policy/dynamic_tf_policy.py +++ b/rllib/policy/dynamic_tf_policy.py @@ -617,10 +617,11 @@ def learn_on_loaded_batch(self, offset: int = 0, buffer_index: int = 0): ) # Get the correct slice of the already loaded batch to use, # based on offset and batch size. - batch_size = self.config.get( - "minibatch_size", - self.config.get("sgd_minibatch_size", self.config["train_batch_size"]), - ) + batch_size = self.config.get("minibatch_size") + if batch_size is None: + batch_size = self.config.get( + "sgd_minibatch_size", self.config["train_batch_size"] + ) if batch_size >= len(self._loaded_single_cpu_batch): sliced_batch = self._loaded_single_cpu_batch else: diff --git a/rllib/policy/dynamic_tf_policy_v2.py b/rllib/policy/dynamic_tf_policy_v2.py index f11cba1ee57d..e2ad3d6da0ab 100644 --- a/rllib/policy/dynamic_tf_policy_v2.py +++ b/rllib/policy/dynamic_tf_policy_v2.py @@ -1003,10 +1003,11 @@ def learn_on_loaded_batch(self, offset: int = 0, buffer_index: int = 0): ) # Get the correct slice of the already loaded batch to use, # based on offset and batch size. 
- batch_size = self.config.get( - "minibatch_size", - self.config.get("sgd_minibatch_size", self.config["train_batch_size"]), - ) + batch_size = self.config.get("minibatch_size") + if batch_size is None: + batch_size = self.config.get( + "sgd_minibatch_size", self.config["train_batch_size"] + ) if batch_size >= len(self._loaded_single_cpu_batch): sliced_batch = self._loaded_single_cpu_batch From 927ba3d1b81ec56a9214300bb1b072d0aa130300 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Wed, 4 Sep 2024 16:02:45 +0200 Subject: [PATCH 15/20] fix Signed-off-by: sven1977 --- rllib/algorithms/algorithm_config.py | 2 +- rllib/tests/test_lstm.py | 71 ---------------------------- 2 files changed, 1 insertion(+), 72 deletions(-) diff --git a/rllib/algorithms/algorithm_config.py b/rllib/algorithms/algorithm_config.py index 1044757b7290..53377b1637d2 100644 --- a/rllib/algorithms/algorithm_config.py +++ b/rllib/algorithms/algorithm_config.py @@ -2189,7 +2189,7 @@ def training( Returns: This updated AlgorithmConfig object. """ - if num_sgd_iter is not NotProvided: + if num_sgd_iter != DEPRECATED_VALUE: deprecation_warning( old="config.training(num_sgd_iter=..)", new="config.training(num_epochs=..)", diff --git a/rllib/tests/test_lstm.py b/rllib/tests/test_lstm.py index eda9a0c3e440..969683d8ca38 100644 --- a/rllib/tests/test_lstm.py +++ b/rllib/tests/test_lstm.py @@ -173,77 +173,6 @@ def setUp(self) -> None: def tearDown(self) -> None: ray.shutdown() - def test_simple_optimizer_sequencing(self): - ModelCatalog.register_custom_model("rnn", RNNSpyModel) - register_env("counter", lambda _: DebugCounterEnv()) - config = ( - PPOConfig() - .environment("counter") - .framework("tf") - .env_runners(num_env_runners=0, rollout_fragment_length=10) - .training( - train_batch_size=10, - minibatch_size=10, - num_epochs=1, - model={ - "custom_model": "rnn", - "max_seq_len": 4, - "vf_share_layers": True, - }, - ) - ) - # Force-set simple_optimizer (fully deprecated soon). 
- config.simple_optimizer = True - ppo = config.build() - ppo.train() - ppo.train() - ppo.stop() - - batch0 = pickle.loads( - ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_0") - ) - self.assertEqual( - batch0["sequences"].tolist(), - [[[0], [1], [2], [3]], [[4], [5], [6], [7]], [[8], [9], [0], [0]]], - ) - self.assertEqual(batch0[SampleBatch.SEQ_LENS].tolist(), [4, 4, 2]) - self.assertEqual(batch0["state_in"][0][0].tolist(), [0, 0, 0]) - self.assertEqual(batch0["state_in"][1][0].tolist(), [0, 0, 0]) - self.assertGreater(abs(np.sum(batch0["state_in"][0][1])), 0) - self.assertGreater(abs(np.sum(batch0["state_in"][1][1])), 0) - self.assertTrue( - np.allclose( - batch0["state_in"][0].tolist()[1:], batch0["state_out"][0].tolist()[:-1] - ) - ) - self.assertTrue( - np.allclose( - batch0["state_in"][1].tolist()[1:], batch0["state_out"][1].tolist()[:-1] - ) - ) - - batch1 = pickle.loads( - ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_1") - ) - self.assertEqual( - batch1["sequences"].tolist(), - [ - [[10], [11], [12], [13]], - [[14], [0], [0], [0]], - [[0], [1], [2], [3]], - [[4], [0], [0], [0]], - ], - ) - self.assertEqual(batch1[SampleBatch.SEQ_LENS].tolist(), [4, 1, 4, 1]) - self.assertEqual(batch1["state_in"][0][2].tolist(), [0, 0, 0]) - self.assertEqual(batch1["state_in"][1][2].tolist(), [0, 0, 0]) - self.assertGreater(abs(np.sum(batch1["state_in"][0][0])), 0) - self.assertGreater(abs(np.sum(batch1["state_in"][1][0])), 0) - self.assertGreater(abs(np.sum(batch1["state_in"][0][1])), 0) - self.assertGreater(abs(np.sum(batch1["state_in"][1][1])), 0) - self.assertGreater(abs(np.sum(batch1["state_in"][0][3])), 0) - self.assertGreater(abs(np.sum(batch1["state_in"][1][3])), 0) - def test_minibatch_sequencing(self): ModelCatalog.register_custom_model("rnn", RNNSpyModel) register_env("counter", lambda _: DebugCounterEnv()) From 3264f9c7911b40d830db5d82eac20e7e1074cf28 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Thu, 5 Sep 2024 11:45:30 +0200 Subject: [PATCH 16/20] APPO stateless cartpole not learning Signed-off-by: sven1977 --- rllib/policy/sample_batch.py | 7 ++++++- .../appo/multi_agent_stateless_cartpole_appo.py | 2 +- rllib/tuned_examples/appo/stateless_cartpole_appo.py | 2 +- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/rllib/policy/sample_batch.py b/rllib/policy/sample_batch.py index 33a0b5eea25b..36abaa36ad76 100644 --- a/rllib/policy/sample_batch.py +++ b/rllib/policy/sample_batch.py @@ -733,7 +733,12 @@ def _batch_slice(self, slice_: slice) -> "SampleBatch": infos = self.pop(SampleBatch.INFOS, None) data = tree.map_structure(lambda value: value[start:stop], self) if infos is not None: - data[SampleBatch.INFOS] = infos[start:stop] + # Slice infos according to SEQ_LENS. + info_slice_start = int(sum(self[SampleBatch.SEQ_LENS][:start])) + info_slice_stop = int(sum(self[SampleBatch.SEQ_LENS][start:stop])) + data[SampleBatch.INFOS] = infos[info_slice_start:info_slice_stop] + # Put infos back into `self`. 
+ self[Columns.INFOS] = infos return SampleBatch( data, diff --git a/rllib/tuned_examples/appo/multi_agent_stateless_cartpole_appo.py b/rllib/tuned_examples/appo/multi_agent_stateless_cartpole_appo.py index 4fb553a7b4fc..117ddeb32bd7 100644 --- a/rllib/tuned_examples/appo/multi_agent_stateless_cartpole_appo.py +++ b/rllib/tuned_examples/appo/multi_agent_stateless_cartpole_appo.py @@ -36,7 +36,7 @@ .training( train_batch_size_per_learner=600, lr=0.0005 * ((args.num_gpus or 1) ** 0.5), - num_epochs=6, + num_epochs=1, vf_loss_coeff=0.05, grad_clip=20.0, ) diff --git a/rllib/tuned_examples/appo/stateless_cartpole_appo.py b/rllib/tuned_examples/appo/stateless_cartpole_appo.py index 3763d91c9109..99421ee58bf0 100644 --- a/rllib/tuned_examples/appo/stateless_cartpole_appo.py +++ b/rllib/tuned_examples/appo/stateless_cartpole_appo.py @@ -29,7 +29,7 @@ ) .training( lr=0.0005 * ((args.num_gpus or 1) ** 0.5), - num_epochs=6, + num_epochs=1, vf_loss_coeff=0.05, grad_clip=20.0, ) From 804bfc24ae9712da1c96e8949cffbcb2f973c5d7 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Thu, 5 Sep 2024 13:20:20 +0200 Subject: [PATCH 17/20] wip Signed-off-by: sven1977 --- rllib/tuned_examples/ppo/multi_agent_pendulum_ppo.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/rllib/tuned_examples/ppo/multi_agent_pendulum_ppo.py b/rllib/tuned_examples/ppo/multi_agent_pendulum_ppo.py index 42f0398a97bd..6ef4f2dcbfaf 100644 --- a/rllib/tuned_examples/ppo/multi_agent_pendulum_ppo.py +++ b/rllib/tuned_examples/ppo/multi_agent_pendulum_ppo.py @@ -14,7 +14,7 @@ num_agents=2, ) # Use `parser` to add your own custom command line options to this script -# and (if needed) use their values toset up `config` below. +# and (if needed) use their values to set up `config` below. args = parser.parse_args() register_env("multi_agent_pendulum", lambda cfg: MultiAgentPendulum(config=cfg)) @@ -26,7 +26,6 @@ enable_env_runner_and_connector_v2=True, ) .environment("multi_agent_pendulum", env_config={"num_agents": args.num_agents}) - .env_runners(num_env_runners=4) .training( lr=0.0003, lambda_=0.1, From a79630ac5d5bb3a0a99dd21828c4a0d45b83d130 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Thu, 5 Sep 2024 14:33:49 +0200 Subject: [PATCH 18/20] more ts for pendulum PPO Signed-off-by: sven1977 --- rllib/tuned_examples/ppo/pendulum_ppo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rllib/tuned_examples/ppo/pendulum_ppo.py b/rllib/tuned_examples/ppo/pendulum_ppo.py index 5df6e3e78855..a401ad720867 100644 --- a/rllib/tuned_examples/ppo/pendulum_ppo.py +++ b/rllib/tuned_examples/ppo/pendulum_ppo.py @@ -1,7 +1,7 @@ from ray.rllib.algorithms.ppo import PPOConfig from ray.rllib.utils.test_utils import add_rllib_example_script_args -parser = add_rllib_example_script_args(default_timesteps=400000, default_reward=-300) +parser = add_rllib_example_script_args(default_timesteps=600000, default_reward=-300) parser.set_defaults(enable_new_api_stack=True) # Use `parser` to add your own custom command line options to this script # and (if needed) use their values toset up `config` below. From 7bdab986e866d0df7519b0c605b934c369169099 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Tue, 17 Sep 2024 11:46:12 +0200 Subject: [PATCH 19/20] better PPO Pendulum tuned examples. 
Signed-off-by: sven1977 --- rllib/algorithms/ppo/ppo.py | 2 +- .../add_states_from_episodes_to_batch.py | 6 +++--- .../ppo/multi_agent_pendulum_ppo.py | 13 +++++++++---- rllib/tuned_examples/ppo/pendulum_ppo.py | 18 +++++++++++------- 4 files changed, 24 insertions(+), 15 deletions(-) diff --git a/rllib/algorithms/ppo/ppo.py b/rllib/algorithms/ppo/ppo.py index 558983875df6..459bb779db52 100644 --- a/rllib/algorithms/ppo/ppo.py +++ b/rllib/algorithms/ppo/ppo.py @@ -238,7 +238,7 @@ def training( lambda_: The lambda parameter for General Advantage Estimation (GAE). Defines the exponential weight used between actually measured rewards vs value function estimates over multiple time steps. Specifically, - `lambda_` balances short-term, low-variance estimates with longer-term, + `lambda_` balances short-term, low-variance estimates against long-term, high-variance returns. A `lambda_` of 0.0 makes the GAE rely only on immediate rewards (and vf predictions from there on, reducing variance, but increasing bias), while a `lambda_` of 1.0 only incorporates vf diff --git a/rllib/connectors/common/add_states_from_episodes_to_batch.py b/rllib/connectors/common/add_states_from_episodes_to_batch.py index e4e5bfa2641a..69cf509dab54 100644 --- a/rllib/connectors/common/add_states_from_episodes_to_batch.py +++ b/rllib/connectors/common/add_states_from_episodes_to_batch.py @@ -186,7 +186,6 @@ def __init__( input_observation_space: Optional[gym.Space] = None, input_action_space: Optional[gym.Space] = None, *, - max_seq_len: Optional[int] = None, as_learner_connector: bool = False, **kwargs, ): @@ -323,14 +322,15 @@ def __call__( self.add_n_batch_items( batch=batch, column=Columns.STATE_IN, - # items_to_add.shape=(B,[state-dim]) # B=episode len // max_seq_len + # items_to_add.shape=(B,[state-dim]) + # B=episode len // max_seq_len items_to_add=tree.map_structure( # Explanation: # [::max_seq_len]: only keep every Tth state. # [:-1]: Shift state outs by one, ignore very last # STATE_OUT (but therefore add the lookback/init state at # the beginning). 
- lambda i, o: np.concatenate([[i], o[:-1]])[::max_seq_len], + lambda i, o, m=max_seq_len: np.concatenate([[i], o[:-1]])[::m], look_back_state, state_outs, ), diff --git a/rllib/tuned_examples/ppo/multi_agent_pendulum_ppo.py b/rllib/tuned_examples/ppo/multi_agent_pendulum_ppo.py index 6ef4f2dcbfaf..310fcad8d7cd 100644 --- a/rllib/tuned_examples/ppo/multi_agent_pendulum_ppo.py +++ b/rllib/tuned_examples/ppo/multi_agent_pendulum_ppo.py @@ -1,4 +1,5 @@ from ray.rllib.algorithms.ppo import PPOConfig +from ray.rllib.connectors.env_to_module import MeanStdFilter from ray.rllib.examples.envs.classes.multi_agent import MultiAgentPendulum from ray.rllib.utils.metrics import ( ENV_RUNNER_RESULTS, @@ -26,11 +27,15 @@ enable_env_runner_and_connector_v2=True, ) .environment("multi_agent_pendulum", env_config={"num_agents": args.num_agents}) + .env_runners( + env_to_module_connector=lambda env: MeanStdFilter(), + ) .training( - lr=0.0003, - lambda_=0.1, - vf_clip_param=10.0, - num_epochs=6, + train_batch_size_per_learner=1024, + minibatch_size=128, + lr=0.0002 * (args.num_gpus or 1) ** 0.5, + gamma=0.95, + lambda_=0.5, ) .rl_module( model_config_dict={ diff --git a/rllib/tuned_examples/ppo/pendulum_ppo.py b/rllib/tuned_examples/ppo/pendulum_ppo.py index a401ad720867..e34ad094eed8 100644 --- a/rllib/tuned_examples/ppo/pendulum_ppo.py +++ b/rllib/tuned_examples/ppo/pendulum_ppo.py @@ -1,7 +1,8 @@ from ray.rllib.algorithms.ppo import PPOConfig +from ray.rllib.connectors.env_to_module import MeanStdFilter from ray.rllib.utils.test_utils import add_rllib_example_script_args -parser = add_rllib_example_script_args(default_timesteps=600000, default_reward=-300) +parser = add_rllib_example_script_args(default_timesteps=400000, default_reward=-300) parser.set_defaults(enable_new_api_stack=True) # Use `parser` to add your own custom command line options to this script # and (if needed) use their values toset up `config` below. @@ -9,16 +10,19 @@ config = ( PPOConfig() + .environment("Pendulum-v1") .env_runners( num_env_runners=2, - num_envs_per_env_runner=10, + num_envs_per_env_runner=20, + env_to_module_connector=lambda env: MeanStdFilter(), ) - .environment("Pendulum-v1") .training( - lr=0.0003, - lambda_=0.1, - vf_clip_param=10.0, - num_epochs=6, + train_batch_size_per_learner=1024, + minibatch_size=128, + lr=0.0002 * (args.num_gpus or 1) ** 0.5, + gamma=0.95, + lambda_=0.5, + # num_epochs=8, ) .rl_module( model_config_dict={ From c26ae5d43cd37ccb4e3b1a5eff5714c7538241a4 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Tue, 17 Sep 2024 12:45:18 +0200 Subject: [PATCH 20/20] fix Signed-off-by: sven1977 --- rllib/tuned_examples/ppo/multi_agent_pendulum_ppo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rllib/tuned_examples/ppo/multi_agent_pendulum_ppo.py b/rllib/tuned_examples/ppo/multi_agent_pendulum_ppo.py index 310fcad8d7cd..ba2c94d0f408 100644 --- a/rllib/tuned_examples/ppo/multi_agent_pendulum_ppo.py +++ b/rllib/tuned_examples/ppo/multi_agent_pendulum_ppo.py @@ -28,7 +28,7 @@ ) .environment("multi_agent_pendulum", env_config={"num_agents": args.num_agents}) .env_runners( - env_to_module_connector=lambda env: MeanStdFilter(), + env_to_module_connector=lambda env: MeanStdFilter(multi_agent=True), ) .training( train_batch_size_per_learner=1024,
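
Taken together, patches 17-20 settle on one recipe for the PPO Pendulum tuned examples: a MeanStdFilter env-to-module connector for observation normalization, a 1024-timestep per-learner batch iterated in minibatches of 128, a learning rate scaled with the square root of the GPU count, and gamma=0.95 with lambda_=0.5. The sketch below reproduces that recipe standalone, outside the example-script harness; it assumes the new API stack is switched on explicitly via api_stack() (the tuned examples do this through their command-line parser instead), and the NUM_LEARNER_GPUS constant plus the short build()/train() loop are illustrative additions, not part of the patch.

    from ray.rllib.algorithms.ppo import PPOConfig
    from ray.rllib.connectors.env_to_module import MeanStdFilter

    # Illustrative assumption: no learner GPUs; the `or 1` keeps the lr
    # scaling factor at 1.0 in that case, mirroring the tuned examples.
    NUM_LEARNER_GPUS = 0

    config = (
        PPOConfig()
        .api_stack(
            enable_rl_module_and_learner=True,
            enable_env_runner_and_connector_v2=True,
        )
        .environment("Pendulum-v1")
        .env_runners(
            num_env_runners=2,
            num_envs_per_env_runner=20,
            # Normalize observations with a running mean/std filter before
            # they reach the RLModule.
            env_to_module_connector=lambda env: MeanStdFilter(),
        )
        .training(
            # Each training step gathers 1024 timesteps per learner and
            # iterates over them in minibatches of 128.
            train_batch_size_per_learner=1024,
            minibatch_size=128,
            lr=0.0002 * ((NUM_LEARNER_GPUS or 1) ** 0.5),
            gamma=0.95,
            lambda_=0.5,
        )
    )

    algo = config.build()
    for _ in range(5):
        print(algo.train())
    algo.stop()

The multi-agent variant in patch 20 differs only in the connector: MeanStdFilter(multi_agent=True) makes the filter operate on the per-agent observation dicts produced by the multi-agent env.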