From 7728726080ed2b7a875608135ef4849c61406397 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Wed, 10 Apr 2024 22:42:19 +0200 Subject: [PATCH 01/11] wip Signed-off-by: sven1977 --- rllib/BUILD | 3 +- rllib/examples/README.rst | 0 ...raining_step_on_and_off_policy_combined.py | 179 ++++++++++-------- .../checkpoint_by_custom_criteria.py | 141 ++++++++------ .../curriculum/curriculum_learning.py | 45 +---- 5 files changed, 188 insertions(+), 180 deletions(-) create mode 100644 rllib/examples/README.rst diff --git a/rllib/BUILD b/rllib/BUILD index be75db2de236..e33b9b621d7e 100644 --- a/rllib/BUILD +++ b/rllib/BUILD @@ -2112,14 +2112,13 @@ py_test( srcs = ["examples/checkpoints/cartpole_dqn_export.py"], ) -#@OldAPIStack py_test( name = "examples/checkpoints/checkpoint_by_custom_criteria", main = "examples/checkpoints/checkpoint_by_custom_criteria.py", tags = ["team:rllib", "exclusive", "examples"], size = "medium", srcs = ["examples/checkpoints/checkpoint_by_custom_criteria.py"], - args = ["--stop-iters=3 --num-cpus=3"] + args = ["--enable-new-api-stack", "--stop-reward=150.0", "--num-cpus=9"] ) #@OldAPIStack diff --git a/rllib/examples/README.rst b/rllib/examples/README.rst new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/rllib/examples/algorithms/custom_training_step_on_and_off_policy_combined.py b/rllib/examples/algorithms/custom_training_step_on_and_off_policy_combined.py index 97ebf6a10d30..fd782f4828ff 100644 --- a/rllib/examples/algorithms/custom_training_step_on_and_off_policy_combined.py +++ b/rllib/examples/algorithms/custom_training_step_on_and_off_policy_combined.py @@ -1,31 +1,57 @@ -"""Example of using a custom training workflow. - -This example creates a number of CartPole agents, some of which are trained with -DQN, and some of which are trained with PPO. Both are executed concurrently -with a custom training workflow. +"""Example of using a custom training workflow via overriding `Algorithm.training_step`. + +This example: +- defines a new Algorithm subclass and implements its `setup` (to add a DQN replay +buffer) and `training_step` (to define a custom workflow using both on-policy and off- +policy learning). +- uses a multi-agent CartPole environment, in which N agents map to N RLModules, +some of which are trained using PPO (on-policy) and some are trained by DQN +(off-policy). +- training all RLModules (policies) is performed concurrently using the above mentioned +custom training workflow (defined in `training_step()`). + + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack --num-agents=2` + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. + +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + + +Results to expect +----------------- +In the console output, you can see the PPO policy ("learnable_policy") does much +better than "random": + ++-------------------+------------+----------+------+----------------+ +| Trial name | status | loc | iter | total time (s) | +| | | | | | +|-------------------+------------+----------+------+----------------+ +| PPO_multi_agen... | TERMINATED | 127. ... 
| 20 | 58.646 | ++-------------------+------------+----------+------+----------------+ + ++--------+-------------------+-----------------+--------------------+ +| ts | combined reward | reward random | reward | +| | | | learnable_policy | ++--------+-------------------+-----------------+--------------------| +| 80000 | 481.26 | 78.41 | 464.41 | ++--------+-------------------+-----------------+--------------------+ """ -import argparse -import os - -import ray -from ray import air, tune from ray.rllib.algorithms.algorithm import Algorithm from ray.rllib.algorithms.algorithm_config import AlgorithmConfig -from ray.rllib.algorithms.dqn.dqn import DQNConfig -from ray.rllib.algorithms.dqn.dqn_tf_policy import DQNTFPolicy -from ray.rllib.algorithms.dqn.dqn_torch_policy import DQNTorchPolicy -from ray.rllib.algorithms.ppo.ppo import PPOConfig -from ray.rllib.algorithms.ppo.ppo_tf_policy import PPOTF1Policy -from ray.rllib.algorithms.ppo.ppo_torch_policy import PPOTorchPolicy -from ray.rllib.evaluation.postprocessing import Postprocessing -from ray.rllib.execution.rollout_ops import synchronous_parallel_sample -from ray.rllib.execution.train_ops import train_one_step -from ray.rllib.utils.replay_buffers.multi_agent_replay_buffer import ( - MultiAgentReplayBuffer, -) +from ray.rllib.algorithms.ppo import PPOConfig +from ray.rllib.core.rl_module.rl_module import SingleAgentRLModuleSpec +from ray.rllib.core.rl_module.marl_module import MultiAgentRLModuleSpec from ray.rllib.examples.envs.classes.multi_agent import MultiAgentCartPole -from ray.rllib.policy.sample_batch import MultiAgentBatch, concat_samples +from ray.rllib.examples.rl_modules.classes.random_rlm import RandomRLModule from ray.rllib.utils.annotations import override from ray.rllib.utils.metrics import ( NUM_AGENT_STEPS_SAMPLED, @@ -33,45 +59,33 @@ NUM_TARGET_UPDATES, LAST_TARGET_UPDATE_TS, ) -from ray.rllib.utils.sgd import standardized -from ray.rllib.utils.test_utils import check_learning_achieved +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) from ray.rllib.utils.typing import ResultDict from ray.tune.registry import register_env -parser = argparse.ArgumentParser() -parser.add_argument("--torch", action="store_true") -parser.add_argument("--mixed-torch-tf", action="store_true") -parser.add_argument( - "--local-mode", - action="store_true", - help="Init Ray in local mode for easier debugging.", -) -parser.add_argument( - "--as-test", - action="store_true", - help="Whether this script should be run as a test: --stop-reward must " - "be achieved within --stop-timesteps AND --stop-iters.", -) -parser.add_argument( - "--stop-iters", type=int, default=600, help="Number of iterations to train." -) -parser.add_argument( - "--stop-timesteps", type=int, default=200000, help="Number of timesteps to train." -) -# 600.0 = 4 (num_agents) x 150.0 -parser.add_argument( - "--stop-reward", type=float, default=600.0, help="Reward at which we stop training." + +parser = add_rllib_example_script_args( + default_iters=600, default_reward=600.0, default_timesteps=200000 ) # Define new Algorithm with custom `training_step()` method (training workflow). -class MyAlgo(Algorithm): +class MyOnPolicyOffPolicyAlgo(Algorithm): @override(Algorithm) def setup(self, config): # Call super's `setup` to create rollout workers. super().setup(config) - # Create local replay buffer. 
- self.local_replay_buffer = MultiAgentReplayBuffer(num_shards=1, capacity=50000) + # TODO (sven): Once we have a multi-agent capable replay buffer, use it here. + # For now, we are ok with a separate buffer per DQN agent (as we are doing + # independent learning anyways). + # Create N local replay buffers (one for each DQN agent). + self.local_replay_buffers = [ + EpisodeMultiAgentReplayBuffer(num_shards=1, capacity=50000) + for n in range(0, args.num_agents, 2) + ] @override(Algorithm) def training_step(self) -> ResultDict: @@ -152,19 +166,14 @@ def training_step(self) -> ResultDict: if __name__ == "__main__": args = parser.parse_args() - assert not ( - args.torch and args.mixed_torch_tf - ), "Use either --torch or --mixed-torch-tf, not both!" - - ray.init(local_mode=args.local_mode) # Simple environment with 4 independent cartpole entities register_env( - "multi_agent_cartpole", lambda _: MultiAgentCartPole({"num_agents": 4}) + "env", lambda cfg: MultiAgentCartPole({"num_agents": 4, **cfg}) ) # Note that since the algorithm below does not include a default policy or - # policy configs, we have to explicitly set it in the multiagent config: + # policy configs, we have to explicitly set it in the multi_agent config: policies = { "ppo_policy": ( PPOTorchPolicy if args.torch or args.mixed_torch_tf else PPOTF1Policy, @@ -184,36 +193,48 @@ def training_step(self) -> ResultDict: ), } - def policy_mapping_fn(agent_id, episode, worker, **kwargs): + def agent_to_module_mapping_fn(agent_id, episode, **kwargs): if agent_id % 2 == 0: - return "ppo_policy" + return "ppo_module" else: - return "dqn_policy" + return "dqn_module" - config = ( + base_config = ( AlgorithmConfig() - # TODO (Kourosh): Migrate this to the new RLModule / Learner API. - .experimental(_enable_new_api_stack=False) .environment("multi_agent_cartpole") - .framework("torch" if args.torch else "tf") .multi_agent(policies=policies, policy_mapping_fn=policy_mapping_fn) - .rollouts(num_rollout_workers=0, rollout_fragment_length=50) - # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0. - .resources(num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0"))) + .rollouts(rollout_fragment_length=50) .reporting(metrics_num_episodes_for_smoothing=30) ) - stop = { - "training_iteration": args.stop_iters, - "timesteps_total": args.stop_timesteps, - "episode_reward_mean": args.stop_reward, - } + run_rllib_example_script_experiment(base_config, args) + + - results = tune.Tuner( - MyAlgo, param_space=config.to_dict(), run_config=air.RunConfig(stop=stop) - ).fit() - if args.as_test: - check_learning_achieved(results, args.stop_reward) + base_config = ( + PPOConfig() + .environment("multi_agent_cartpole") + .multi_agent( + policies={"learnable_policy", "random"}, + # Map to either random behavior or PPO learning behavior based on + # the agent's ID. + policy_mapping_fn=lambda agent_id, *args, **kwargs: [ + "learnable_policy", + "random", + ][agent_id % 2], + # We need to specify this here, b/c the `forward_train` method of + # `RandomRLModule` (ModuleID="random") throws a not-implemented error. 
+ policies_to_train=["learnable_policy"], + ) + .rl_module( + rl_module_spec=MultiAgentRLModuleSpec( + module_specs={ + "learnable_policy": SingleAgentRLModuleSpec(), + "random": SingleAgentRLModuleSpec(module_class=RandomRLModule), + } + ), + ) + ) - ray.shutdown() + run_rllib_example_script_experiment(base_config, args) diff --git a/rllib/examples/checkpoints/checkpoint_by_custom_criteria.py b/rllib/examples/checkpoints/checkpoint_by_custom_criteria.py index 79d2405d88da..f5d915bbb03f 100644 --- a/rllib/examples/checkpoints/checkpoint_by_custom_criteria.py +++ b/rllib/examples/checkpoints/checkpoint_by_custom_criteria.py @@ -1,67 +1,92 @@ -# TODO (sven): Move this example script into the new API stack. +"""Example extracting a checkpoint from n trials using one or more custom criteria. -import argparse -import os +This example: +- runs a simple CartPole experiment with three different learning rates (three tune +"trials"). During the experiment, for each trial, we create a checkpoint at each +iteration. +- at the end of the experiment, we compare the trials and pick the one that performed +best, based on the criterion: Lowest episode count per single iteration (for CartPole, +a low episode count means the episodes are very long and thus the reward is also very +high). +- from that best trial (with the lowest episode count), we then pick those checkpoints +that a) have the lowest policy loss (good) and b) have the highest value function loss +(bad). -import ray -from ray import air, tune -from ray.tune.registry import get_trainable_cls -parser = argparse.ArgumentParser() -parser.add_argument( - "--run", type=str, default="PPO", help="The RLlib-registered algorithm to use." -) -parser.add_argument("--num-cpus", type=int, default=0) -parser.add_argument( - "--framework", - choices=["tf", "tf2", "torch"], - default="torch", - help="The DL framework specifier.", +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack` + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. 
+ +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + + +Results to expect +----------------- +In the console output, you can see the performance of the three different learning +rates used here: + ++-----------------------------+------------+-----------------+--------+--------+ +| Trial name | status | loc | lr | iter | +|-----------------------------+------------+-----------------+--------+--------+ +| PPO_CartPole-v1_d7dbe_00000 | TERMINATED | 127.0.0.1:98487 | 0.01 | 17 | +| PPO_CartPole-v1_d7dbe_00001 | TERMINATED | 127.0.0.1:98488 | 0.001 | 8 | +| PPO_CartPole-v1_d7dbe_00002 | TERMINATED | 127.0.0.1:98489 | 0.0001 | 9 | ++-----------------------------+------------+-----------------+--------+--------+ + ++------------------+-------+----------+----------------------+----------------------+ +| total time (s) | ts | reward | episode_reward_max | episode_reward_min | +|------------------+-------+----------+----------------------+----------------------+ +| 28.1068 | 39797 | 151.11 | 500 | 12 | +| 13.304 | 18728 | 158.91 | 500 | 15 | +| 14.8848 | 21069 | 167.36 | 500 | 13 | ++------------------+-------+----------+----------------------+----------------------+ + ++--------------------+ +| episode_len_mean | +|--------------------| +| 151.11 | +| 158.91 | +| 167.36 | ++--------------------+ +""" + +from ray import tune +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, ) -parser.add_argument("--stop-iters", type=int, default=200) -parser.add_argument("--stop-timesteps", type=int, default=100000) -parser.add_argument("--stop-reward", type=float, default=150.0) -parser.add_argument( - "--local-mode", - action="store_true", - help="Init Ray in local mode for easier debugging.", +from ray.tune.registry import get_trainable_cls + +parser = add_rllib_example_script_args( + default_reward=450.0, default_timesteps=100000, default_iters=200 ) + if __name__ == "__main__": args = parser.parse_args() - ray.init(num_cpus=args.num_cpus or None, local_mode=args.local_mode) + # Force-set `args.checkpoint_freq` to 1. + args.checkpoint_freq = 1 # Simple generic config. - config = ( - get_trainable_cls(args.run) + base_config = ( + get_trainable_cls(args.algo) .get_default_config() .environment("CartPole-v1") - # Run with tracing enabled for tf2. - .framework(args.framework) # Run 3 trials. .training( lr=tune.grid_search([0.01, 0.001, 0.0001]), train_batch_size=2341 - ) # TEST - # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0. - .resources(num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0"))) + ) ) - - stop = { - "training_iteration": args.stop_iters, - "timesteps_total": args.stop_timesteps, - "episode_reward_mean": args.stop_reward, - } - # Run tune for some iterations and generate checkpoints. - tuner = tune.Tuner( - args.run, - param_space=config.to_dict(), - run_config=air.RunConfig( - stop=stop, checkpoint_config=air.CheckpointConfig(checkpoint_frequency=1) - ), - ) - results = tuner.fit() + results = run_rllib_example_script_experiment(base_config, args) # Get the best of the 3 trials by using some metric. 
# NOTE: Choosing the min `episodes_this_iter` automatically picks the trial @@ -79,8 +104,8 @@ best_result = results.get_best_result(metric=metric, mode="min", scope="all") value_best_metric = best_result.metrics_dataframe[metric].min() print( - "Best trial's lowest episode length (over all " - "iterations): {}".format(value_best_metric) + f"Best trial was the one with lr={best_result.metrics['config']['lr']}. " + "Reached lowest episode count ({value_best_metric}) in a single iteration." ) # Confirm, we picked the right trial. @@ -88,19 +113,23 @@ # Get the best checkpoints from the trial, based on different metrics. # Checkpoint with the lowest policy loss value: - if config._enable_new_api_stack: + if args.enable_new_api_stack: policy_loss_key = "info/learner/default_policy/policy_loss" else: policy_loss_key = "info/learner/default_policy/learner_stats/policy_loss" - ckpt = results.get_best_result(metric=policy_loss_key, mode="min").checkpoint - print("Lowest pol-loss: {}".format(ckpt)) + best_result = results.get_best_result(metric=policy_loss_key, mode="min") + ckpt = best_result.checkpoint + lowest_policy_loss = best_result.metrics_dataframe[policy_loss_key].min() + print(f"Checkpoint w/ lowest policy loss: {ckpt}") + print(f"Lowest policy loss: {lowest_policy_loss}") # Checkpoint with the highest value-function loss: - if config._enable_new_api_stack: + if args.enable_new_api_stack: vf_loss_key = "info/learner/default_policy/vf_loss" else: vf_loss_key = "info/learner/default_policy/learner_stats/vf_loss" - ckpt = results.get_best_result(metric=vf_loss_key, mode="max").checkpoint - print("Highest vf-loss: {}".format(ckpt)) - - ray.shutdown() + best_result = results.get_best_result(metric=vf_loss_key, mode="max") + ckpt = best_result.checkpoint + highest_value_fn_loss = best_result.metrics_dataframe[vf_loss_key].max() + print(f"Checkpoint w/ highest value function loss: {ckpt}") + print(f"Highest value function loss: {highest_value_fn_loss}") diff --git a/rllib/examples/curriculum/curriculum_learning.py b/rllib/examples/curriculum/curriculum_learning.py index 5ce3b09ffd8d..f12be5ed2ea1 100644 --- a/rllib/examples/curriculum/curriculum_learning.py +++ b/rllib/examples/curriculum/curriculum_learning.py @@ -29,21 +29,6 @@ torch, nn = try_import_torch() parser = argparse.ArgumentParser() -parser.add_argument( - "--run", type=str, default="PPO", help="The RLlib-registered algorithm to use." -) -parser.add_argument( - "--framework", - choices=["tf", "tf2", "torch"], - default="torch", - help="The DL framework specifier.", -) -parser.add_argument( - "--as-test", - action="store_true", - help="Whether this script should be run as a test: --stop-reward must " - "be achieved within --stop-timesteps AND --stop-iters.", -) parser.add_argument( "--stop-iters", type=int, default=50, help="Number of iterations to train." ) @@ -56,11 +41,6 @@ default=10000.0, help="Reward at which we stop training.", ) -parser.add_argument( - "--local-mode", - action="store_true", - help="Init Ray in local mode for easier debugging.", -) def curriculum_fn( @@ -99,7 +79,6 @@ def curriculum_fn( if __name__ == "__main__": args = parser.parse_args() - ray.init(local_mode=args.local_mode) # Can also register the env creator function explicitly with: # register_env( @@ -113,29 +92,9 @@ def curriculum_fn( CurriculumCapableEnv, env_config={"start_level": 1}, # TODO (sven): Replace this API with a simple custom callback - # (override `on_train_results` in which we simply set each EnvRunners' - # env to the new stage). 
+ # (override `on_train_results` in which we set each EnvRunners' env to + # the new stage). env_task_fn=curriculum_fn, ) - .framework(args.framework) .rollouts(num_rollout_workers=2, num_envs_per_worker=5) - # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0. - .resources(num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0"))) - ) - - stop = { - "training_iteration": args.stop_iters, - "timesteps_total": args.stop_timesteps, - "episode_reward_mean": args.stop_reward, - } - - tuner = tune.Tuner( - args.run, - param_space=config.to_dict(), - run_config=air.RunConfig(stop=stop, verbose=2), ) - results = tuner.fit() - - if args.as_test: - check_learning_achieved(results, args.stop_reward) - ray.shutdown() From ac1ba1084a362c8e1a0d790f3c77e877c5c145f0 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Fri, 12 Apr 2024 16:16:47 +0200 Subject: [PATCH 02/11] wip Signed-off-by: sven1977 --- rllib/BUILD | 3 +- rllib/algorithms/algorithm_config.py | 6 +- ...raining_step_on_and_off_policy_combined.py | 7 +- .../checkpoint_by_custom_criteria.py | 4 +- .../curriculum/curriculum_learning.py | 290 +++++++++++++----- .../examples/evaluation/custom_evaluation.py | 4 +- ...ock_paper_scissors_heuristic_vs_learned.py | 2 +- rllib/utils/test_utils.py | 14 +- 8 files changed, 230 insertions(+), 100 deletions(-) diff --git a/rllib/BUILD b/rllib/BUILD index 633e879d75a0..c821f418ebb4 100644 --- a/rllib/BUILD +++ b/rllib/BUILD @@ -2251,14 +2251,13 @@ py_test( # subdirectory: curriculum/ # .................................... -#@OldAPIStack py_test( name = "examples/curriculum/curriculum_learning", main = "examples/curriculum/curriculum_learning.py", tags = ["team:rllib", "exclusive", "examples"], size = "medium", srcs = ["examples/curriculum/curriculum_learning.py"], - args = ["--as-test", "--stop-reward=800.0"] + args = ["--enable-new-api-stack", "--as-test"] ) # subdirectory: debugging/ diff --git a/rllib/algorithms/algorithm_config.py b/rllib/algorithms/algorithm_config.py index fa87ab0660c9..fa144251959b 100644 --- a/rllib/algorithms/algorithm_config.py +++ b/rllib/algorithms/algorithm_config.py @@ -1468,11 +1468,7 @@ def environment( if env is not NotProvided: self.env = env if env_config is not NotProvided: - deep_update( - self.env_config, - env_config, - True, - ) + deep_update(self.env_config, env_config, True) if observation_space is not NotProvided: self.observation_space = observation_space if action_space is not NotProvided: diff --git a/rllib/examples/algorithms/custom_training_step_on_and_off_policy_combined.py b/rllib/examples/algorithms/custom_training_step_on_and_off_policy_combined.py index fd782f4828ff..c847bbc65a98 100644 --- a/rllib/examples/algorithms/custom_training_step_on_and_off_policy_combined.py +++ b/rllib/examples/algorithms/custom_training_step_on_and_off_policy_combined.py @@ -168,9 +168,7 @@ def training_step(self) -> ResultDict: args = parser.parse_args() # Simple environment with 4 independent cartpole entities - register_env( - "env", lambda cfg: MultiAgentCartPole({"num_agents": 4, **cfg}) - ) + register_env("env", lambda cfg: MultiAgentCartPole({"num_agents": 4, **cfg})) # Note that since the algorithm below does not include a default policy or # policy configs, we have to explicitly set it in the multi_agent config: @@ -209,9 +207,6 @@ def agent_to_module_mapping_fn(agent_id, episode, **kwargs): run_rllib_example_script_experiment(base_config, args) - - - base_config = ( PPOConfig() .environment("multi_agent_cartpole") diff --git 
a/rllib/examples/checkpoints/checkpoint_by_custom_criteria.py b/rllib/examples/checkpoints/checkpoint_by_custom_criteria.py index f5d915bbb03f..d48ec05965fe 100644 --- a/rllib/examples/checkpoints/checkpoint_by_custom_criteria.py +++ b/rllib/examples/checkpoints/checkpoint_by_custom_criteria.py @@ -81,9 +81,7 @@ .get_default_config() .environment("CartPole-v1") # Run 3 trials. - .training( - lr=tune.grid_search([0.01, 0.001, 0.0001]), train_batch_size=2341 - ) + .training(lr=tune.grid_search([0.01, 0.001, 0.0001]), train_batch_size=2341) ) # Run tune for some iterations and generate checkpoints. results = run_rllib_example_script_experiment(base_config, args) diff --git a/rllib/examples/curriculum/curriculum_learning.py b/rllib/examples/curriculum/curriculum_learning.py index f12be5ed2ea1..48425bb6cede 100644 --- a/rllib/examples/curriculum/curriculum_learning.py +++ b/rllib/examples/curriculum/curriculum_learning.py @@ -1,100 +1,236 @@ -# TODO (sven): Move this example script into the new API stack. +"""Example of using an env-task curriculum via implementing a custom callback. + +This example: + - demonstrates how to define your own curriculum-capable environments using + gymnasium's FrozenLake env. + - defines a custom callback that gets called once per iteration and - if necessary - + changes the maps used by FrozenLake on all EnvRunners to a new task (by moving the + goal position further and further away from the starting position). + - also demonstrates an alternative approach via reloading/recreating an entirely new + env inside all EnvRunners. + - uses Tune and RLlib to curriculum-learn the env described above and compares 2 + algorithms, one that does use curriculum learning vs one that does not. + +We use a FrozenLake (sparse reward) environment with a map size of 8x8 and a time step +limit of 16 to make it almost impossible for a non-curriculum policy to learn. + + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack` + +Use the `--no-curriculum` flag to disable curriculum learning and force your policy +to be trained on the hardest task right away. With this option, the algorithm should NOT +succeed. + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. + +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` -""" -Example of a curriculum learning setup using the `TaskSettableEnv` API -and the env_task_fn config. -This example shows: - - Writing your own curriculum-capable environment using gym.Env. - - Defining a env_task_fn that determines, whether and which new task - the env(s) should be set to (using the TaskSettableEnv API). - - Using Tune and RLlib to curriculum-learn this env. +Results to expect +----------------- +In the console output, you can see that only PPO policy that uses a curriculum can +actually learn, whereas the one that is thrown into the toughest task right from the +start never learns anything. 
+ +Policy using the curriculum: ++-------------------------------+------------+-----------------+--------+ +| Trial name | status | loc | iter | +|-------------------------------+------------+-----------------+--------+ +| PPO_FrozenLake-v1_93ca4_00000 | TERMINATED | 127.0.0.1:73318 | 41 | ++-------------------------------+------------+-----------------+--------+ ++------------------+--------+----------+--------------------+ +| total time (s) | ts | reward | episode_len_mean | +|------------------+--------+----------+--------------------| +| 97.652 | 164000 | 1 | 14.0348 | ++------------------+--------+----------+--------------------+ + +Policy NOT using the curriculum (trying to solve the hardest task right away): + -You can visualize experiment results in ~/ray_results using TensorBoard. """ -import argparse -import numpy as np -import os - -import ray -from ray import air, tune -from ray.rllib.env.apis.task_settable_env import TaskSettableEnv, TaskType -from ray.rllib.env.env_context import EnvContext -from ray.rllib.examples.envs.classes.curriculum_capable_env import CurriculumCapableEnv -from ray.rllib.utils.framework import try_import_tf, try_import_torch -from ray.rllib.utils.test_utils import check_learning_achieved -from ray.tune.registry import get_trainable_cls +from functools import partial -tf1, tf, tfv = try_import_tf() -torch, nn = try_import_torch() +from ray.rllib.algorithms.algorithm import Algorithm +from ray.rllib.algorithms.callbacks import DefaultCallbacks +from ray.rllib.connectors.env_to_module import ( + AddObservationsFromEpisodesToBatch, + FlattenObservations, + WriteObservationsToEpisodes, +) +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) +from ray.tune.registry import get_trainable_cls -parser = argparse.ArgumentParser() +parser = add_rllib_example_script_args(default_iters=100, default_timesteps=600000) parser.add_argument( - "--stop-iters", type=int, default=50, help="Number of iterations to train." + "--recreate-entire-gym-env", + action="store_true", + help="Whether to recreate the entire gym.vector.Env on each EnvRunner whenever a " + "new task/map should be loaded.", ) parser.add_argument( - "--stop-timesteps", type=int, default=200000, help="Number of timesteps to train." + "--upgrade-task-threshold", + type=float, + default=0.99, + help="The mean episode return, upon reaching of which we increase the task by one.", ) parser.add_argument( - "--stop-reward", - type=float, - default=10000.0, - help="Reward at which we stop training.", + "--no-curriculum", + action="store_true", + help="Whether to NOT use curriculum learning (and instead trying to solve the " + "hardest task right away).", ) -def curriculum_fn( - train_results: dict, task_settable_env: TaskSettableEnv, env_ctx: EnvContext -) -> TaskType: - """Function returning a possibly new task to set `task_settable_env` to. - - Args: - train_results: The train results returned by Algorithm.train(). - task_settable_env: A single TaskSettableEnv object - used inside any worker and at any vector position. Use `env_ctx` - to get the worker_index, vector_index, and num_workers. - env_ctx: The env context object (i.e. env's config dict - plus properties worker_index, vector_index and num_workers) used - to setup the `task_settable_env`. - - Returns: - TaskType: The task to set the env to. This may be the same as the - current one. - """ - # Our env supports tasks 1 (default) to 5. 
- # With each task, rewards get scaled up by a factor of 10, such that: - # Level 1: Expect rewards between 0.0 and 1.0. - # Level 2: Expect rewards between 1.0 and 10.0, etc.. - # We will thus raise the level/task each time we hit a new power of 10.0 - new_task = int(np.log10(train_results["episode_reward_mean"]) + 2.1) - # Clamp between valid values, just in case: - new_task = max(min(new_task, 5), 1) - print( - f"Worker #{env_ctx.worker_index} vec-idx={env_ctx.vector_index}" - f"\nR={train_results['episode_reward_mean']}" - f"\nSetting env to task={new_task}" - ) - return new_task +ENV_OPTIONS = { + "is_slippery": False, + # Limit the number of steps the agent is allowed to make in the env to + # make it almost impossible to learn without the curriculum. + "max_episode_steps": 16, +} + +# Our 3 tasks: 0=easiest, 1=medium, 2=hard +ENV_MAPS = [ + # 0 + [ + "SFFHFFFH", + "FFFHFFFF", + "FFGFFFFF", + "FFFFFFFF", + "HFFFFFFF", + "HHFFFFHF", + "FFFFFHHF", + "FHFFFFFF", + ], + # 1 + [ + "SFFHFFFH", + "FFFHFFFF", + "FFFFFFFF", + "FFFFFFFF", + "HFFFFFFF", + "HHFFGFHF", + "FFFFFHHF", + "FHFFFFFF", + ], + # 2 + [ + "SFFHFFFH", + "FFFHFFFF", + "FFFFFFFF", + "FFFFFFFF", + "HFFFFFFF", + "HHFFFFHF", + "FFFFFHHF", + "FHFFFFFG", + ], +] + + +# Simple function sent to an EnvRunner to change the map of all its gym.Envs from +# the current one to a new (tougher) one, in which the goal position is further away +# from the starting position. Note that a map is a list of strings, each one +# representing one row in the map. Each character in the strings represent a single +# field (S=starting position, H=hole (bad), F=frozen/free field (ok), G=goal (great!)). +def _remote_fn(env_runner, new_task: int): + # We recreate the entire env object and re-assign it to the EnvRunner. + if True: # args.recreate_entire_gym_env: + # A SingleAgentEnvRunner holds a gym.vector.Env under `self.env`. + env_runner.config.environment(env_config={"desc": ENV_MAPS[new_task]}) + env_runner._make_env() + # Set the EnvRunners `_needs_initial_reset` property (to make sure we reset the + # next time we call `sample()` on the EnvRunner. + env_runner._needs_initial_reset = True + # We change each vectorized Env's map to the provided one. + else: + raise NotImplementedError + + +class CallbackThatSetsNewEnvTask(DefaultCallbacks): + """Custom callback implementing `on_train_result()` for changing the envs' maps.""" + + def on_train_result( + self, + *, + algorithm: Algorithm, + result: dict, + **kwargs, + ) -> None: + # Hack: Store the current task inside a counter in our Algorithm. + # W/o a curriculum, the task is always 2 (hardest). + if args.no_curriculum: + algorithm._counters["current_env_task"] = 2 + current_task = algorithm._counters["current_env_task"] + + # If episode return is consistently 1.0, we switch to a more difficult task + # (if possible). If we already mastered the most difficult task, we publish + # our victory in the result dict. + result["task_solved"] = 0.0 + current_return = result["sampler_results"]["episode_reward_mean"] + if current_return > args.upgrade_task_threshold: + if current_task < 2: + new_task = current_task + 1 + print( + f"Switching task/map on all EnvRunners to #{new_task} (0=easiest, " + f"2=hardest), b/c R={current_return} on current task." + ) + algorithm.workers.foreach_worker( + func=partial(_remote_fn, new_task=new_task) + ) + algorithm._counters["current_env_task"] = new_task + + # Hardest task was solved (1.0) -> report this in the results dict. 
+ elif current_return == 1.0: + result["task_solved"] = 1.0 if __name__ == "__main__": args = parser.parse_args() - # Can also register the env creator function explicitly with: - # register_env( - # "curriculum_env", lambda config: CurriculumCapableEnv(config)) - - config = ( - get_trainable_cls(args.run) + base_config = ( + get_trainable_cls(args.algo) .get_default_config() - # or "curriculum_env" if registered above + # Plug in our curriculum callbacks that controls when we should upgrade the env + # task based on the received return for the current task. + .callbacks(CallbackThatSetsNewEnvTask) .environment( - CurriculumCapableEnv, - env_config={"start_level": 1}, - # TODO (sven): Replace this API with a simple custom callback - # (override `on_train_results` in which we set each EnvRunners' env to - # the new stage). - env_task_fn=curriculum_fn, + "FrozenLake-v1", + env_config={ + # w/ curriculum: start with task=0 (easiest) + # w/o curriculum: start directly with hardest task 2. + "desc": ENV_MAPS[2 if args.no_curriculum else 0], + **ENV_OPTIONS, + }, + ) + .rollouts( + num_envs_per_worker=5, + env_to_module_connector=lambda env: [ + AddObservationsFromEpisodesToBatch(), + FlattenObservations(), + WriteObservationsToEpisodes(), + ], ) - .rollouts(num_rollout_workers=2, num_envs_per_worker=5) + ) + + stop = { + "training_iteration": args.stop_iters, + # Reward directly does not matter to us as we would like to continue + # after the 0-task (easiest) reaches a return of 1.0. But we DO want to + # stop, once the entire task is done (reaches return of 1.0 in the most + # difficult task=2). + "task_solved": 1.0, + "timesteps_total": args.stop_timesteps, + } + + run_rllib_example_script_experiment( + base_config, args, stop=stop, success_metric={"task_solved": 1.0} ) diff --git a/rllib/examples/evaluation/custom_evaluation.py b/rllib/examples/evaluation/custom_evaluation.py index b8e15c7739e0..7902ade2b520 100644 --- a/rllib/examples/evaluation/custom_evaluation.py +++ b/rllib/examples/evaluation/custom_evaluation.py @@ -140,5 +140,7 @@ def custom_eval_function(algorithm: Algorithm, eval_workers: WorkerSet) -> Resul base_config, args, stop=stop, - success_metric="evaluation/sampler_results/episode_reward_mean", + success_metric={ + "evaluation/sampler_results/episode_reward_mean": args.stop_reward, + }, ) diff --git a/rllib/examples/multi_agent/rock_paper_scissors_heuristic_vs_learned.py b/rllib/examples/multi_agent/rock_paper_scissors_heuristic_vs_learned.py index 19a7a2edb529..49c23c02b584 100644 --- a/rllib/examples/multi_agent/rock_paper_scissors_heuristic_vs_learned.py +++ b/rllib/examples/multi_agent/rock_paper_scissors_heuristic_vs_learned.py @@ -143,5 +143,5 @@ base_config, args, stop=stop, - success_metric="sampler_results/policy_reward_mean/learned", + success_metric={"sampler_results/policy_reward_mean/learned": args.stop_reward}, ) diff --git a/rllib/utils/test_utils.py b/rllib/utils/test_utils.py index 216ecb77d752..ac55c29218f5 100644 --- a/rllib/utils/test_utils.py +++ b/rllib/utils/test_utils.py @@ -1199,7 +1199,7 @@ def run_rllib_example_script_experiment( args: argparse.Namespace, *, stop: Optional[Dict] = None, - success_metric: str = "sampler_results/episode_reward_mean", + success_metric: Optional[Dict] = None, ) -> Union[ResultDict, tune.result_grid.ResultGrid]: """Given an algorithm config and some command line args, runs an experiment. 
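# NOTE: The hunk above changes `success_metric` from a plain metric-key string to an
# optional dict of {result key: minimum value}. The lines below are a minimal calling
# sketch under that new signature — illustration only, not part of this patch. It
# assumes the `add_rllib_example_script_args` / `run_rllib_example_script_experiment`
# helpers defined in this file; the environment, stop values, and thresholds are
# placeholders.
from ray.rllib.algorithms.ppo import PPOConfig
from ray.rllib.utils.test_utils import (
    add_rllib_example_script_args,
    run_rllib_example_script_experiment,
)

parser = add_rllib_example_script_args(default_reward=450.0, default_iters=200)
args = parser.parse_args()

base_config = PPOConfig().environment("CartPole-v1")

run_rllib_example_script_experiment(
    base_config,
    args,
    # Optional custom stopping criteria (if omitted, they are derived from `args`).
    stop={
        "training_iteration": args.stop_iters,
        "timesteps_total": args.stop_timesteps,
    },
    # New convention: a single-entry dict mapping a ResultDict key to the minimum
    # value that must be reached for an `--as-test` run to count as successful
    # (previously this argument was just the metric-key string).
    success_metric={"sampler_results/episode_reward_mean": args.stop_reward},
)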
@@ -1261,7 +1261,7 @@ def run_rllib_example_script_experiment( if args.no_tune: algo = config.build() - for iter in range(args.stop_iters): + for _ in range(args.stop_iters): results = algo.train() print(f"R={results['sampler_results']['episode_reward_mean']}", end="") if "evaluation" in results: @@ -1334,10 +1334,14 @@ def run_rllib_example_script_experiment( ).fit() if args.as_test: + if success_metric is None: + success_metric = {"sampler_results/episode_reward_mean": args.stop_reward} + # TODO (sven): Make this work for more than one metric (AND-logic?). + metric = next(iter(success_metric.keys())) check_learning_achieved( - results, - args.stop_reward, - metric=success_metric, + tune_results=results, + min_value=success_metric[metric], + metric=metric, ) return results From 44e436bf0323c4fac1f4f92ce63e14bb7700c174 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Fri, 12 Apr 2024 16:18:52 +0200 Subject: [PATCH 03/11] wip Signed-off-by: sven1977 --- rllib/examples/README.rst | 0 ...raining_step_on_and_off_policy_combined.py | 180 ++++++++---------- 2 files changed, 82 insertions(+), 98 deletions(-) delete mode 100644 rllib/examples/README.rst diff --git a/rllib/examples/README.rst b/rllib/examples/README.rst deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/rllib/examples/algorithms/custom_training_step_on_and_off_policy_combined.py b/rllib/examples/algorithms/custom_training_step_on_and_off_policy_combined.py index c847bbc65a98..97ebf6a10d30 100644 --- a/rllib/examples/algorithms/custom_training_step_on_and_off_policy_combined.py +++ b/rllib/examples/algorithms/custom_training_step_on_and_off_policy_combined.py @@ -1,57 +1,31 @@ -"""Example of using a custom training workflow via overriding `Algorithm.training_step`. - -This example: -- defines a new Algorithm subclass and implements its `setup` (to add a DQN replay -buffer) and `training_step` (to define a custom workflow using both on-policy and off- -policy learning). -- uses a multi-agent CartPole environment, in which N agents map to N RLModules, -some of which are trained using PPO (on-policy) and some are trained by DQN -(off-policy). -- training all RLModules (policies) is performed concurrently using the above mentioned -custom training workflow (defined in `training_step()`). - - -How to run this script ----------------------- -`python [script file name].py --enable-new-api-stack --num-agents=2` - -For debugging, use the following additional command line options -`--no-tune --num-env-runners=0` -which should allow you to set breakpoints anywhere in the RLlib code and -have the execution stop there for inspection and debugging. - -For logging to your WandB account, use: -`--wandb-key=[your WandB API key] --wandb-project=[some project name] ---wandb-run-name=[optional: WandB run name (within the defined project)]` - - -Results to expect ------------------ -In the console output, you can see the PPO policy ("learnable_policy") does much -better than "random": - -+-------------------+------------+----------+------+----------------+ -| Trial name | status | loc | iter | total time (s) | -| | | | | | -|-------------------+------------+----------+------+----------------+ -| PPO_multi_agen... | TERMINATED | 127. ... 
| 20 | 58.646 | -+-------------------+------------+----------+------+----------------+ - -+--------+-------------------+-----------------+--------------------+ -| ts | combined reward | reward random | reward | -| | | | learnable_policy | -+--------+-------------------+-----------------+--------------------| -| 80000 | 481.26 | 78.41 | 464.41 | -+--------+-------------------+-----------------+--------------------+ +"""Example of using a custom training workflow. + +This example creates a number of CartPole agents, some of which are trained with +DQN, and some of which are trained with PPO. Both are executed concurrently +with a custom training workflow. """ +import argparse +import os + +import ray +from ray import air, tune from ray.rllib.algorithms.algorithm import Algorithm from ray.rllib.algorithms.algorithm_config import AlgorithmConfig -from ray.rllib.algorithms.ppo import PPOConfig -from ray.rllib.core.rl_module.rl_module import SingleAgentRLModuleSpec -from ray.rllib.core.rl_module.marl_module import MultiAgentRLModuleSpec +from ray.rllib.algorithms.dqn.dqn import DQNConfig +from ray.rllib.algorithms.dqn.dqn_tf_policy import DQNTFPolicy +from ray.rllib.algorithms.dqn.dqn_torch_policy import DQNTorchPolicy +from ray.rllib.algorithms.ppo.ppo import PPOConfig +from ray.rllib.algorithms.ppo.ppo_tf_policy import PPOTF1Policy +from ray.rllib.algorithms.ppo.ppo_torch_policy import PPOTorchPolicy +from ray.rllib.evaluation.postprocessing import Postprocessing +from ray.rllib.execution.rollout_ops import synchronous_parallel_sample +from ray.rllib.execution.train_ops import train_one_step +from ray.rllib.utils.replay_buffers.multi_agent_replay_buffer import ( + MultiAgentReplayBuffer, +) from ray.rllib.examples.envs.classes.multi_agent import MultiAgentCartPole -from ray.rllib.examples.rl_modules.classes.random_rlm import RandomRLModule +from ray.rllib.policy.sample_batch import MultiAgentBatch, concat_samples from ray.rllib.utils.annotations import override from ray.rllib.utils.metrics import ( NUM_AGENT_STEPS_SAMPLED, @@ -59,33 +33,45 @@ NUM_TARGET_UPDATES, LAST_TARGET_UPDATE_TS, ) -from ray.rllib.utils.test_utils import ( - add_rllib_example_script_args, - run_rllib_example_script_experiment, -) +from ray.rllib.utils.sgd import standardized +from ray.rllib.utils.test_utils import check_learning_achieved from ray.rllib.utils.typing import ResultDict from ray.tune.registry import register_env - -parser = add_rllib_example_script_args( - default_iters=600, default_reward=600.0, default_timesteps=200000 +parser = argparse.ArgumentParser() +parser.add_argument("--torch", action="store_true") +parser.add_argument("--mixed-torch-tf", action="store_true") +parser.add_argument( + "--local-mode", + action="store_true", + help="Init Ray in local mode for easier debugging.", +) +parser.add_argument( + "--as-test", + action="store_true", + help="Whether this script should be run as a test: --stop-reward must " + "be achieved within --stop-timesteps AND --stop-iters.", +) +parser.add_argument( + "--stop-iters", type=int, default=600, help="Number of iterations to train." +) +parser.add_argument( + "--stop-timesteps", type=int, default=200000, help="Number of timesteps to train." +) +# 600.0 = 4 (num_agents) x 150.0 +parser.add_argument( + "--stop-reward", type=float, default=600.0, help="Reward at which we stop training." ) # Define new Algorithm with custom `training_step()` method (training workflow). 
-class MyOnPolicyOffPolicyAlgo(Algorithm): +class MyAlgo(Algorithm): @override(Algorithm) def setup(self, config): # Call super's `setup` to create rollout workers. super().setup(config) - # TODO (sven): Once we have a multi-agent capable replay buffer, use it here. - # For now, we are ok with a separate buffer per DQN agent (as we are doing - # independent learning anyways). - # Create N local replay buffers (one for each DQN agent). - self.local_replay_buffers = [ - EpisodeMultiAgentReplayBuffer(num_shards=1, capacity=50000) - for n in range(0, args.num_agents, 2) - ] + # Create local replay buffer. + self.local_replay_buffer = MultiAgentReplayBuffer(num_shards=1, capacity=50000) @override(Algorithm) def training_step(self) -> ResultDict: @@ -166,12 +152,19 @@ def training_step(self) -> ResultDict: if __name__ == "__main__": args = parser.parse_args() + assert not ( + args.torch and args.mixed_torch_tf + ), "Use either --torch or --mixed-torch-tf, not both!" + + ray.init(local_mode=args.local_mode) # Simple environment with 4 independent cartpole entities - register_env("env", lambda cfg: MultiAgentCartPole({"num_agents": 4, **cfg})) + register_env( + "multi_agent_cartpole", lambda _: MultiAgentCartPole({"num_agents": 4}) + ) # Note that since the algorithm below does not include a default policy or - # policy configs, we have to explicitly set it in the multi_agent config: + # policy configs, we have to explicitly set it in the multiagent config: policies = { "ppo_policy": ( PPOTorchPolicy if args.torch or args.mixed_torch_tf else PPOTF1Policy, @@ -191,45 +184,36 @@ def training_step(self) -> ResultDict: ), } - def agent_to_module_mapping_fn(agent_id, episode, **kwargs): + def policy_mapping_fn(agent_id, episode, worker, **kwargs): if agent_id % 2 == 0: - return "ppo_module" + return "ppo_policy" else: - return "dqn_module" + return "dqn_policy" - base_config = ( + config = ( AlgorithmConfig() + # TODO (Kourosh): Migrate this to the new RLModule / Learner API. + .experimental(_enable_new_api_stack=False) .environment("multi_agent_cartpole") + .framework("torch" if args.torch else "tf") .multi_agent(policies=policies, policy_mapping_fn=policy_mapping_fn) - .rollouts(rollout_fragment_length=50) + .rollouts(num_rollout_workers=0, rollout_fragment_length=50) + # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0. + .resources(num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0"))) .reporting(metrics_num_episodes_for_smoothing=30) ) - run_rllib_example_script_experiment(base_config, args) + stop = { + "training_iteration": args.stop_iters, + "timesteps_total": args.stop_timesteps, + "episode_reward_mean": args.stop_reward, + } - base_config = ( - PPOConfig() - .environment("multi_agent_cartpole") - .multi_agent( - policies={"learnable_policy", "random"}, - # Map to either random behavior or PPO learning behavior based on - # the agent's ID. - policy_mapping_fn=lambda agent_id, *args, **kwargs: [ - "learnable_policy", - "random", - ][agent_id % 2], - # We need to specify this here, b/c the `forward_train` method of - # `RandomRLModule` (ModuleID="random") throws a not-implemented error. 
- policies_to_train=["learnable_policy"], - ) - .rl_module( - rl_module_spec=MultiAgentRLModuleSpec( - module_specs={ - "learnable_policy": SingleAgentRLModuleSpec(), - "random": SingleAgentRLModuleSpec(module_class=RandomRLModule), - } - ), - ) - ) + results = tune.Tuner( + MyAlgo, param_space=config.to_dict(), run_config=air.RunConfig(stop=stop) + ).fit() + + if args.as_test: + check_learning_achieved(results, args.stop_reward) - run_rllib_example_script_experiment(base_config, args) + ray.shutdown() From 2cffca9672f4f84dd1e573d91e474f295144fba9 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Fri, 12 Apr 2024 17:24:23 +0200 Subject: [PATCH 04/11] wip Signed-off-by: sven1977 --- rllib/BUILD | 15 +- .../curriculum/curriculum_learning.py | 3 +- rllib/examples/envs/custom_gym_env.py | 231 +++++++++--------- rllib/utils/test_utils.py | 63 ++++- 4 files changed, 168 insertions(+), 144 deletions(-) diff --git a/rllib/BUILD b/rllib/BUILD index c821f418ebb4..ba1580c21b9b 100644 --- a/rllib/BUILD +++ b/rllib/BUILD @@ -2276,24 +2276,13 @@ py_test( # subdirectory: envs/ # .................................... -#@OldAPIStack -py_test( - name = "examples/envs/custom_gym_env_tf", - main = "examples/envs/custom_gym_env.py", - tags = ["team:rllib", "exclusive", "examples"], - size = "medium", - srcs = ["examples/envs/custom_gym_env.py"], - args = ["--as-test", "--framework=tf"] -) - -#@OldAPIStack py_test( - name = "examples/envs/custom_gym_env_torch", + name = "examples/envs/custom_gym_env", main = "examples/envs/custom_gym_env.py", tags = ["team:rllib", "exclusive", "examples"], size = "medium", srcs = ["examples/envs/custom_gym_env.py"], - args = ["--as-test", "--framework=torch"] + args = ["--enable-new-api-stack", "--as-test"] ) #@OldAPIStack diff --git a/rllib/examples/curriculum/curriculum_learning.py b/rllib/examples/curriculum/curriculum_learning.py index 48425bb6cede..921161bf194d 100644 --- a/rllib/examples/curriculum/curriculum_learning.py +++ b/rllib/examples/curriculum/curriculum_learning.py @@ -52,8 +52,7 @@ +------------------+--------+----------+--------------------+ Policy NOT using the curriculum (trying to solve the hardest task right away): - - +[DOES NOT LEARN AT ALL] """ from functools import partial diff --git a/rllib/examples/envs/custom_gym_env.py b/rllib/examples/envs/custom_gym_env.py index 95cdbcee81d5..616e3cee15b9 100644 --- a/rllib/examples/envs/custom_gym_env.py +++ b/rllib/examples/envs/custom_gym_env.py @@ -1,167 +1,154 @@ -# TODO (sven): Move this example script into the new API stack. +"""Example of defining a custom gymnasium Env to be learned by an RLlib Algorithm. -""" -Example of a custom gym environment. Run this example for a demo. +This example: + - demonstrates how to write your own (single-agent) gymnasium Env class, define its + physics and mechanics, the reward function used, the allowed actions (action space), + and the type of observations (observation space), etc.. + - shows how to configure and setup this environment class within an RLlib + Algorithm config. + - runs the experiment with the configured algo, trying to solve the environment. + +To see more details on which env we are building for this example, take a look at the +`SimpleCorridor` class defined below. + + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack` + +Use the `--corridor-length` option to set a custom length for the corridor. Note that +for extremely long corridors, the algorithm should take longer to learn. 
-This example shows the usage of: - - a custom environment - - Ray Tune for grid search to try different learning rates +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. -You can visualize experiment results in ~/ray_results using TensorBoard. +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` -Run example with defaults: -$ python custom_env.py -For CLI options: -$ python custom_env.py --help + +Results to expect +----------------- +You should see results similar to the following in your console output: + ++--------------------------------+------------+-----------------+--------+ +| Trial name | status | loc | iter | +|--------------------------------+------------+-----------------+--------+ +| PPO_SimpleCorridor_78714_00000 | TERMINATED | 127.0.0.1:85794 | 7 | ++--------------------------------+------------+-----------------+--------+ + ++------------------+-------+----------+--------------------+ +| total time (s) | ts | reward | episode_len_mean | +|------------------+-------+----------+--------------------| +| 18.3034 | 28000 | 0.908918 | 12.9676 | ++------------------+-------+----------+--------------------+ """ -import argparse import gymnasium as gym from gymnasium.spaces import Discrete, Box import numpy as np -import os import random -import ray -from ray import air, tune -from ray.rllib.env.env_context import EnvContext -from ray.rllib.utils.framework import try_import_tf, try_import_torch -from ray.rllib.utils.test_utils import check_learning_achieved -from ray.tune.logger import pretty_print -from ray.tune.registry import get_trainable_cls - -tf1, tf, tfv = try_import_tf() -torch, nn = try_import_torch() +from typing import Optional -parser = argparse.ArgumentParser() -parser.add_argument( - "--run", type=str, default="PPO", help="The RLlib-registered algorithm to use." -) -parser.add_argument( - "--framework", - choices=["tf", "tf2", "torch"], - default="torch", - help="The DL framework specifier.", -) -parser.add_argument( - "--as-test", - action="store_true", - help="Whether this script should be run as a test: --stop-reward must " - "be achieved within --stop-timesteps AND --stop-iters.", -) -parser.add_argument( - "--stop-iters", type=int, default=50, help="Number of iterations to train." +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, ) -parser.add_argument( - "--stop-timesteps", type=int, default=100000, help="Number of timesteps to train." -) -parser.add_argument( - "--stop-reward", type=float, default=0.1, help="Reward at which we stop training." -) -parser.add_argument( - "--no-tune", - action="store_true", - help="Run without Tune using a manual train loop instead. In this case," - "use PPO without grid search and no TensorBoard.", +from ray.tune.registry import get_trainable_cls, register_env # noqa + + +parser = add_rllib_example_script_args( + default_reward=0.9, default_iters=50, default_timesteps=100000 ) parser.add_argument( - "--local-mode", - action="store_true", - help="Init Ray in local mode for easier debugging.", + "--corridor-length", + type=int, + default=10, + help="The length of the corridor in fields. 
Note that this number includes the " + "starting- and goal states.", ) class SimpleCorridor(gym.Env): - """Example of a custom env in which you have to walk down a corridor. - - You can configure the length of the corridor via the env config.""" - - def __init__(self, config: EnvContext): - self.end_pos = config["corridor_length"] + """Example of a custom env in which the agent has to walk down a corridor. + + ------------ + |S........G| + ------------ + , where S is the starting position, G is the goal position, and fields with '.' + mark free spaces, over which the agent may step. The length of the above example + corridor is 10. + Allowed actions are left (0) and right (1). + The reward function is -0.01 per step taken and a uniform random value between + 0.5 and 1.5 when reaching the goal state. + + You can configure the length of the corridor via the env's config. Thus, in your + AlgorithmConfig, you can do: + `config.environment(env_config={"corridor_length": ..})`. + """ + + def __init__(self, config: Optional[dict] = None): + config = config or {} + self.end_pos = config.get("corridor_length", 7) self.cur_pos = 0 self.action_space = Discrete(2) self.observation_space = Box(0.0, self.end_pos, shape=(1,), dtype=np.float32) - # Set the seed. This is only used for the final (reach goal) reward. - self.reset(seed=config.worker_index * config.num_workers) def reset(self, *, seed=None, options=None): random.seed(seed) self.cur_pos = 0 - return [self.cur_pos], {} + # Return obs and (empty) info dict. + return np.array([self.cur_pos], np.float32), {} def step(self, action): assert action in [0, 1], action + # Move left. if action == 0 and self.cur_pos > 0: self.cur_pos -= 1 + # Move right. elif action == 1: self.cur_pos += 1 - done = truncated = self.cur_pos >= self.end_pos - # Produce a random reward when we reach the goal. + + # The environment only ever terminates when we reach the goal state. + terminated = self.cur_pos >= self.end_pos + truncated = False + # Produce a random reward from [0.5, 1.5] when we reach the goal. + reward = random.uniform(0.5, 1.5) if terminated else -0.01 + infos = {} return ( - [self.cur_pos], - random.random() * 2 if done else -0.1, - done, + np.array([self.cur_pos], np.float32), + reward, + terminated, truncated, - {}, + infos, ) if __name__ == "__main__": args = parser.parse_args() - print(f"Running with following CLI options: {args}") - - ray.init(local_mode=args.local_mode) # Can also register the env creator function explicitly with: - # register_env("corridor", lambda config: SimpleCorridor(config)) + # register_env("corridor-env", lambda config: SimpleCorridor()) - config = ( - get_trainable_cls(args.run) - .get_default_config() - # or "corridor" if registered above - .environment(SimpleCorridor, env_config={"corridor_length": 5}) - .framework(args.framework) - .rollouts(num_rollout_workers=1) - # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0. - .resources(num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0"))) - ) + # Or you can hard code certain settings into the Env's constructor (`config`). 
+ # register_env( + # "corridor-env-w-len-100", + # lambda config: SimpleCorridor({**config, **{"corridor_length": 100}}), + # ) - stop = { - "training_iteration": args.stop_iters, - "timesteps_total": args.stop_timesteps, - "episode_reward_mean": args.stop_reward, - } - - if args.no_tune: - # manual training with train loop using PPO and fixed learning rate - if args.run != "PPO": - raise ValueError("Only support --run PPO with --no-tune.") - print("Running manual train loop without Ray Tune.") - # use fixed learning rate instead of grid search (needs tune) - config.lr = 1e-3 - algo = config.build() - # run manual training loop and print results after each iteration - for _ in range(args.stop_iters): - result = algo.train() - print(pretty_print(result)) - # stop training of the target train steps or reward are reached - if ( - result["timesteps_total"] >= args.stop_timesteps - or result["episode_reward_mean"] >= args.stop_reward - ): - break - algo.stop() - else: - # automated run with Tune and grid search and TensorBoard - print("Training automatically with Ray Tune") - tuner = tune.Tuner( - args.run, - param_space=config.to_dict(), - run_config=air.RunConfig(stop=stop), - ) - results = tuner.fit() + # Or allow the RLlib user to set more c'tor options via their algo config: + # config.environment(env_config={[c'tor arg name]: [value]}) + # register_env("corridor-env", lambda config: SimpleCorridor(config)) - if args.as_test: - print("Checking if learning goals were achieved") - check_learning_achieved(results, args.stop_reward) + base_config = ( + get_trainable_cls(args.algo) + .get_default_config() + .environment( + SimpleCorridor, # or provide the registered string: "corridor-env" + env_config={"corridor_length": args.corridor_length}, + ) + ) - ray.shutdown() + run_rllib_example_script_experiment(base_config, args) diff --git a/rllib/utils/test_utils.py b/rllib/utils/test_utils.py index ac55c29218f5..ecd9da2f896e 100644 --- a/rllib/utils/test_utils.py +++ b/rllib/utils/test_utils.py @@ -1194,6 +1194,11 @@ def should_check_eval(experiment): return result +# TODO (sven): Make this the de-facto, well documented, and unified utility for most of +# our tests: +# - CI (label: "learning_tests") +# - release tests (benchmarks) +# - example scripts def run_rllib_example_script_experiment( base_config: "AlgorithmConfig", args: argparse.Namespace, @@ -1204,24 +1209,57 @@ def run_rllib_example_script_experiment( """Given an algorithm config and some command line args, runs an experiment. There are some constraints on what properties must be defined in `args`. - It should ideally be generated via the `` + It should ideally be generated via calling + `args = add_rllib_example_script_args()`, which can be found in this very module + here. + + The function sets up an Algorithm object from the given config (altered by the + contents of `args`), then runs the Algorithm via Tune (or manually, if + `args.no_tune` is set to True) using the stopping criteria in `stop`. + + At the end of the experiment, if `args.as_test` is True, checks, whether the + Algorithm reached the `success_metric` (if None, use + `sampler_results/episode_reward_mean` with a minimum value of `args.stop_reward`). + + See https://github.com/ray-project/ray/tree/master/rllib/examples for an overview + of all supported command line options. Args: base_config: The AlgorithmConfig object to use for this experiment. This base config will be automatically "extended" based on some of the provided `args`. 
For example, `args.num_env_runners` is used to set `config.num_rollout_workers`, etc.. - args: A argparse.Namespace object which must have the following properties - defined: `stop_iters`, `stop_reward`, `stop_timesteps`, `no_tune`, - `verbose`, `checkpoint_freq`, `as_test`. Optionally, for wandb logging: - `wandb_key`, `wandb_project`, `wandb_run_name`. + args: A argparse.Namespace object, ideally returned by calling + `args = add_rllib_example_script_args()`. It must have the following + properties defined: `stop_iters`, `stop_reward`, `stop_timesteps`, + `no_tune`, `verbose`, `checkpoint_freq`, `as_test`. Optionally, for WandB + logging: `wandb_key`, `wandb_project`, `wandb_run_name`. + stop: An optional dict mapping ResultDict key strings (using "/" in case of + nesting, e.g. "sampler_results/episode_reward_mean" for referring to + `result_dict['sampler_results']['episode_reward_mean']` to minimum + values, reaching of which will stop the experiment). Default is: + { + "sampler_results/episode_reward_mean": args.stop_reward, + "training_iteration": args.stop_iters, + "timesteps_total": args.stop_timesteps, + } + success_metric: Only relevent if `args.as_test` is True. + A dict mapping a single(!) ResultDict key string (using "/" in + case of nesting, e.g. "sampler_results/episode_reward_mean" for referring + to `result_dict['sampler_results']['episode_reward_mean']` to minimum + values, reaching of which will stop the experiment) to a single(!) minimum + value to be reached in order for the experiment to count as successful. + If `args.as_test` is True AND this `success_metric` is not reached with the + bounds defined by `stop`, will raise an Exception. Returns: The last ResultDict from a --no-tune run OR the tune.Tuner.fit() results. """ + # Initialize Ray. ray.init(num_cpus=args.num_cpus or None, local_mode=args.local_mode) + # Define one or more stopping criteria. stop = stop or { "training_iteration": args.stop_iters, "sampler_results/episode_reward_mean": args.stop_reward, @@ -1231,10 +1269,13 @@ def run_rllib_example_script_experiment( from ray.rllib.env.multi_agent_env_runner import MultiAgentEnvRunner from ray.rllib.env.single_agent_env_runner import SingleAgentEnvRunner - # Extend the `base_config` based on provided `args`. + # Enhance the `base_config`, based on provided `args`. config = ( + # Set the framework. base_config.framework(args.framework) + # Enable the new API stack? .experimental(_enable_new_api_stack=args.enable_new_api_stack) + # Define EnvRunner/RolloutWorker scaling and behavior. .rollouts( num_rollout_workers=args.num_env_runners, # Set up the correct env-runner to use depending on @@ -1249,6 +1290,7 @@ def run_rllib_example_script_experiment( ) ), ) + # Define compute resources used. .resources( # Old stack. num_gpus=0 if args.enable_new_api_stack else args.num_gpus, @@ -1259,6 +1301,7 @@ def run_rllib_example_script_experiment( ) ) + # Run the experiment w/o Tune (directly operate on the RLlib Algorithm object). if args.no_tune: algo = config.build() for _ in range(args.stop_iters): @@ -1281,6 +1324,9 @@ def run_rllib_example_script_experiment( return results return results + # Run the experiment using Ray Tune. + + # Log results using WandB. callbacks = None if hasattr(args, "wandb_key") and args.wandb_key is not None: project = args.wandb_project or ( @@ -1295,8 +1341,9 @@ def run_rllib_example_script_experiment( ) ] - progress_reporter = None + # Auto-configure a CLIReporter (to log the results to the console). 
# Use better ProgressReporter for multi-agent cases: List individual policy rewards. + progress_reporter = None if args.num_agents > 0: progress_reporter = CLIReporter( metric_columns={ @@ -1317,6 +1364,7 @@ def run_rllib_example_script_experiment( # `CLIReporter`. os.environ["RAY_AIR_NEW_OUTPUT"] = "0" + # Run the actual experiment (using Tune). results = tune.Tuner( config.algo_class, param_space=config, @@ -1333,6 +1381,7 @@ def run_rllib_example_script_experiment( tune_config=tune.TuneConfig(num_samples=args.num_samples), ).fit() + # If run as a test, check whether we reached the specified success criteria. if args.as_test: if success_metric is None: success_metric = {"sampler_results/episode_reward_mean": args.stop_reward} From d741906f636bdb3a60d25b804d4ac28ec97208c6 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Sat, 13 Apr 2024 19:01:27 +0200 Subject: [PATCH 05/11] wip Signed-off-by: sven1977 --- rllib/examples/ray_tune/custom_experiment.py | 25 +++++++++++++++----- 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/rllib/examples/ray_tune/custom_experiment.py b/rllib/examples/ray_tune/custom_experiment.py index 9c42937dc0c6..1f326d3c002a 100644 --- a/rllib/examples/ray_tune/custom_experiment.py +++ b/rllib/examples/ray_tune/custom_experiment.py @@ -1,6 +1,18 @@ -# TODO (sven): Move this example script into the new API stack. +"""Example of a custom Ray Tune experiment wrapping an RLlib Algorithm. + +You should only use such a customized workflow if the following conditions apply: +- You know exactly what you are doing :) +- Simply configuring an existing RLlib Algorithm (e.g. PPO) via its AlgorithmConfig +is not enough and doesn't allow you to shape the Algorithm into behaving the way you'd +like. +-- Note that for complex and custom evaluation procedures there is a RLlib Algorithm +config option (see examples/evaluation/custom_evaluation.py for more details). +- Subclassing + +""" + +TODO: Continue docstring above -"""Example of a custom experiment wrapped around an RLlib Algorithm.""" import argparse import ray @@ -48,11 +60,12 @@ def experiment(config): args = parser.parse_args() ray.init(num_cpus=3) - config = ppo.PPOConfig().environment("CartPole-v1") - config = config.to_dict() - config["train-iterations"] = args.train_iterations + base_config = ( + ) + base_config = base_config.to_dict() + base_config["train-iterations"] = args.train_iterations tune.Tuner( - tune.with_resources(experiment, ppo.PPO.default_resource_request(config)), + tune.with_resources(experiment, ppo.PPO.default_resource_request(base_config)), param_space=config, ).fit() From 403c3afa521232aafff7de23c73c054b3da93095 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Mon, 15 Apr 2024 09:48:59 +0200 Subject: [PATCH 06/11] wip Signed-off-by: sven1977 --- rllib/BUILD | 12 +- rllib/algorithms/algorithm_config.py | 11 ++ rllib/examples/ray_tune/custom_experiment.py | 93 ++++++++---- rllib/examples/ray_tune/custom_logger.py | 142 +++++++++--------- .../ray_tune/custom_progress_reporter.py | 115 ++++++++++++++ .../ray_tune/custom_train_function.py | 69 --------- rllib/utils/test_utils.py | 20 ++- 7 files changed, 276 insertions(+), 186 deletions(-) create mode 100644 rllib/examples/ray_tune/custom_progress_reporter.py delete mode 100644 rllib/examples/ray_tune/custom_train_function.py diff --git a/rllib/BUILD b/rllib/BUILD index ba1580c21b9b..82d844445c3d 100644 --- a/rllib/BUILD +++ b/rllib/BUILD @@ -2834,7 +2834,6 @@ py_test( # subdirectory: ray_tune/ # .................................... 
-#@OldAPIStack py_test( name = "examples/ray_tune/custom_experiment", main = "examples/ray_tune/custom_experiment.py", @@ -2844,23 +2843,20 @@ py_test( args = ["--train-iterations=10"] ) -#@OldAPIStack py_test( name = "examples/ray_tune/custom_logger", main = "examples/ray_tune/custom_logger.py", tags = ["team:rllib", "exclusive", "examples"], size = "medium", srcs = ["examples/ray_tune/custom_logger.py"], - args = ["--stop-iters=3"] ) -#@OldAPIStack py_test( - name = "examples/ray_tune/custom_train_function", - main = "examples/ray_tune/custom_train_function.py", + name = "examples/ray_tune/custom_progres_reporter", + main = "examples/ray_tune/custom_progres_reporter.py", tags = ["team:rllib", "exclusive", "examples"], - size = "medium", - srcs = ["examples/ray_tune/custom_train_function.py"], + size = "small", + srcs = ["examples/ray_tune/custom_progres_reporter.py"], ) # subdirectory: rl_modules/ diff --git a/rllib/algorithms/algorithm_config.py b/rllib/algorithms/algorithm_config.py index fa144251959b..17d29c9effbd 100644 --- a/rllib/algorithms/algorithm_config.py +++ b/rllib/algorithms/algorithm_config.py @@ -4017,6 +4017,17 @@ def _validate_new_api_stack_settings(self): # TODO (sven): Once everything is on the new API stack, we won't need this method # anymore. def _validate_to_be_deprecated_settings(self): + # Env task fn will be deprecated. + if self._enable_new_api_stack and self.env_task_fn is not None: + deprecation_warning( + old="AlgorithmConfig.env_task_fn", + help="The `env_task_fn` API is not supported on the new API stack! " + "Curriculum learning should instead be implemented solely via " + "custom callbacks. Check out our curriculum learning example " + "script for more information: " + "https://github.com/ray-project/ray/blob/master/rllib/examples/curriculum/curriculum_learning.py" # noqa + ) + if self.preprocessor_pref not in ["rllib", "deepmind", None]: raise ValueError( "`config.preprocessor_pref` must be either 'rllib', 'deepmind' or None!" diff --git a/rllib/examples/ray_tune/custom_experiment.py b/rllib/examples/ray_tune/custom_experiment.py index 1f326d3c002a..2f371e87ce54 100644 --- a/rllib/examples/ray_tune/custom_experiment.py +++ b/rllib/examples/ray_tune/custom_experiment.py @@ -11,32 +11,44 @@ """ -TODO: Continue docstring above +"""Example of a custom training workflow. Run this for a demo. -import argparse +This example shows: + - using Tune trainable functions to implement custom training workflows -import ray +You can visualize experiment results in ~/ray_results using TensorBoard. +""" from ray import train, tune -import ray.rllib.algorithms.ppo as ppo - -parser = argparse.ArgumentParser() -parser.add_argument("--train-iterations", type=int, default=10) - - -def experiment(config): - iterations = config.pop("train-iterations") - - algo = ppo.PPO(config=config) - checkpoint = None - train_results = {} - - # Train - for i in range(iterations): - train_results = algo.train() - if i % 2 == 0 or i == iterations - 1: - checkpoint = algo.save(train.get_context().get_trial_dir()) - train.report(train_results) - algo.stop() +from ray.rllib.algorithms.ppo import PPO, PPOConfig +from ray.rllib.env.single_agent_env_runner import SingleAgentEnvRunner + + +def my_experiment(config): + iterations = config.pop("train-iterations", 10) + + config = PPOConfig().update_from_dict(config).environment("CartPole-v1") + + # Train for n iterations with high LR. 
+ config.lr = 0.01 + agent1 = config.build() + for _ in range(iterations): + result = agent1.train() + result["phase"] = 1 + train.report(result) + phase1_time = result["timesteps_total"] + state = agent1.save() + agent1.stop() + + # Train for n iterations with low LR + config.lr = 0.0001 + agent2 = config.build() + agent2.restore(state) + for _ in range(iterations): + result = agent2.train() + result["phase"] = 2 + result["timesteps_total"] += phase1_time # keep time moving forward + train.report(result) + agent2.stop() # Manual Eval config["num_workers"] = 0 @@ -57,15 +69,34 @@ def experiment(config): if __name__ == "__main__": - args = parser.parse_args() - ray.init(num_cpus=3) base_config = ( + PPOConfig() + .experimental(_enable_new_api_stack=True) + .environment("CartPole-v1") + .rollouts( + num_rollout_workers=0, + env_runner_cls=SingleAgentEnvRunner, + ) + ) + # Convert to a plain dict for Tune. Note that this is usually not needed, you can + # pass into the below Tune Tuner any instantiated RLlib AlgorithmConfig object. + # However, for demonstration purposes, we show here how you can add other, arbitrary + # keys to the plain config dict and then pass these keys to your custom experiment + # function. + config_dict = base_config.to_dict() + + # Set a Special flag signalling `my_train_fn` how many iters to do. + config_dict["train-iterations"] = 2 + + training_function = tune.with_resources( + my_experiment, + resources=base_config.algo_class.default_resource_request(base_config), ) - base_config = base_config.to_dict() - base_config["train-iterations"] = args.train_iterations - tune.Tuner( - tune.with_resources(experiment, ppo.PPO.default_resource_request(base_config)), - param_space=config, - ).fit() + tuner = tune.Tuner( + training_function, + # Pass in your config dict. + param_space=config_dict, + ) + tuner.fit() diff --git a/rllib/examples/ray_tune/custom_logger.py b/rllib/examples/ray_tune/custom_logger.py index e45f57b18645..89b59222a31f 100644 --- a/rllib/examples/ray_tune/custom_logger.py +++ b/rllib/examples/ray_tune/custom_logger.py @@ -1,50 +1,59 @@ -# TODO (sven): Move this example script into the new API stack. +"""Example showing how to define a custom Logger class for an RLlib Algorithm. -""" -This example script demonstrates how one can define a custom logger -object for any RLlib Algorithm via the Algorithm's config's `logger_config` property. -By default (logger_config=None), RLlib will construct a tune -UnifiedLogger object, which logs JSON, CSV, and TBX output. +The script uses the AlgorithmConfig's `debugging` API to setup the custom Logger: + +``` +config.debugging(logger_config={ + "type": [some Logger subclass], + "ctor_arg1", ..., + "ctor_arg2", ..., +}) +``` + +All keys other than "type" in the logger_config dict will be passed into the Logger +class's constructor. +By default (logger_config=None), RLlib will construct a Ray Tune UnifiedLogger object, +which logs results to JSON, CSV, and TBX. + +NOTE that a custom Logger is different from a custom `ProgressReporter`, which defines, +how the (frequent) outputs to your console will be formatted. To see an example on how +to write your own Progress reporter, see: +https://github.com/ray-project/ray/tree/master/rllib/examples/ray_tune/custom_progress_reporter.py # noqa Below examples include: - Disable logging entirely. - Using only one of tune's Json, CSV, or TBX loggers. - Defining a custom logger (by sub-classing tune.logger.py::Logger). 
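To make the `logger_config` pattern described above concrete, here is a minimal sketch (illustrative only, not part of the patch); it plugs in Tune's built-in `NoopLogger` to switch off result logging, matching the first bullet above, with all keys other than "type" passed on to the logger as described:

```
from ray.rllib.algorithms.ppo import PPOConfig
from ray.tune.logger import NoopLogger

config = (
    PPOConfig()
    .environment("CartPole-v1")
    # Disable result logging entirely via Tune's no-op logger.
    .debugging(logger_config={"type": NoopLogger, "logdir": "/tmp"})
)
algo = config.build()
print(algo.train()["episode_reward_mean"])
algo.stop()
```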
-""" -import argparse -import os -from ray.rllib.utils.test_utils import check_learning_achieved +How to run this script +---------------------- +`python [script file name].py + + +Results to expect +----------------- +You should see log lines similar to the following in your console output. Note that +these logged lines will mix with the ones produced by Tune's default ProgressReporter. +See above link on how to setup a custom one. + +ABC Avg-return: 20.609375; pi-loss: -0.02921550187703246 +ABC Avg-return: 32.28688524590164; pi-loss: -0.023369029412534572 +ABC Avg-return: 51.92; pi-loss: -0.017113141975661456 +ABC Avg-return: 76.16; pi-loss: -0.01305474770361625 +ABC Avg-return: 100.54; pi-loss: -0.007665307738129169 +ABC Avg-return: 132.33; pi-loss: -0.005010405003325517 +ABC Avg-return: 169.65; pi-loss: -0.008397869592997183 +ABC Avg-return: 203.17; pi-loss: -0.005611495616764371 +Flushing +Closing + +""" + +from ray import air, tune +from ray.rllib.algorithms.ppo import PPOConfig +from ray.rllib.env.single_agent_env_runner import SingleAgentEnvRunner from ray.tune.logger import Logger, LegacyLoggerCallback -from ray.tune.registry import get_trainable_cls - -parser = argparse.ArgumentParser() -parser.add_argument( - "--run", type=str, default="PPO", help="The RLlib-registered algorithm to use." -) -parser.add_argument("--num-cpus", type=int, default=0) -parser.add_argument( - "--framework", - choices=["tf", "tf2", "torch"], - default="torch", - help="The DL framework specifier.", -) -parser.add_argument( - "--as-test", - action="store_true", - help="Whether this script should be run as a test: --stop-reward must " - "be achieved within --stop-timesteps AND --stop-iters.", -) -parser.add_argument( - "--stop-iters", type=int, default=200, help="Number of iterations to train." -) -parser.add_argument( - "--stop-timesteps", type=int, default=100000, help="Number of timesteps to train." -) -parser.add_argument( - "--stop-reward", type=float, default=150.0, help="Reward at which we stop training." -) class MyPrintLogger(Logger): @@ -58,7 +67,15 @@ def _init(self): def on_result(self, result: dict): # Define, what should happen on receiving a `result` (dict). - print(f"{self.prefix}: {result}") + mean_return = result["sampler_results"]["episode_reward_mean"] + pi_loss = ( + result["info"]["learner"]["default_policy"]["policy_loss"] + ) + print( + f"{self.prefix} " + f"Avg-return: {mean_return} " + f"pi-loss: {pi_loss}" + ) def close(self): # Releases all resources used by this logger. @@ -66,25 +83,15 @@ def close(self): def flush(self): # Flushing all possible disk writes to permanent storage. - print("Flushing ;)", flush=True) + print("Flushing", flush=True) if __name__ == "__main__": - import ray - from ray import air, tune - - args = parser.parse_args() - - ray.init(num_cpus=args.num_cpus or None) - config = ( - get_trainable_cls(args.run) - .get_default_config() - .environment( - "CartPole-v1" if args.run not in ["DDPG", "TD3"] else "Pendulum-v1" - ) - # Run with tracing enabled for tf2. - .framework(args.framework) + PPOConfig() + .experimental(_enable_new_api_stack=True) + .rollouts(env_runner_cls=SingleAgentEnvRunner) + .environment("CartPole-v1") # Setting up a custom logger config. # ---------------------------------- # The following are different examples of custom logging setups: @@ -115,27 +122,20 @@ def flush(self): # "logdir": "/somewhere/on/my/file/system/" } ) - # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0. 
- .resources(num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0"))) ) - stop = { - "training_iteration": args.stop_iters, - "timesteps_total": args.stop_timesteps, - "episode_reward_mean": args.stop_reward, - } + stop = {"sampler_results/episode_reward_mean": 200.0} - tuner = tune.Tuner( - args.run, - param_space=config.to_dict(), + # Run the actual experiment (using Tune). + results = tune.Tuner( + config.algo_class, + param_space=config, run_config=air.RunConfig( stop=stop, verbose=2, - callbacks=[LegacyLoggerCallback([MyPrintLogger])], + # Plugin our own logger. + callbacks=[ + LegacyLoggerCallback([MyPrintLogger]), + ], ), - ) - results = tuner.fit() - - if args.as_test: - check_learning_achieved(results, args.stop_reward) - ray.shutdown() + ).fit() diff --git a/rllib/examples/ray_tune/custom_progress_reporter.py b/rllib/examples/ray_tune/custom_progress_reporter.py new file mode 100644 index 000000000000..f06f79312149 --- /dev/null +++ b/rllib/examples/ray_tune/custom_progress_reporter.py @@ -0,0 +1,115 @@ +"""Example showing how to set up a custom progress reporter for an RLlib Algorithm. + +The script sets the `progress_reporter` arg in the air.RunConfig and passes that to +Tune's Tuner: + +``` +tune.Tuner( + param_space=..., # <- your RLlib config + run_config=air.RunConfig( + progress_reporter=[some already instantiated TuneReporterBase object], + ), +) +``` + +By default (progress_reporter=None), Tune will construct a default `CLIReporter` object, +which reports the episode mean return, number of env steps sampled and -trained, and +the total number of episodes run thus far. + +NOTE that a custom progress reporter is different from a custom `Logger`, which defines, +how the (frequent) results are being formatted and written to e.g. a logfile. +To see an example on how to write your own Logger, see: +https://github.com/ray-project/ray/tree/master/rllib/examples/ray_tune/custom_logger.py + + +How to run this script +---------------------- +`python [script file name].py + + +Results to expect +----------------- +You should see something similar to the following in your console output: + ++---------------------+------------+-----------------+--------+------------------+ +| Trial name | status | loc | iter | total time (s) | +|---------------------+------------+-----------------+--------+------------------+ +| PPO_env_bb503_00000 | TERMINATED | 127.0.0.1:26303 | 5 | 30.3823 | ++---------------------+------------+-----------------+--------+------------------+ ++-------+-------------------+------------------+------------------+------------------+ +| ts | combined return | return policy1 | return policy2 | return policy3 | +|-------+-------------------+------------------+------------------+------------------| +| 20000 | 258.7 | 103.4 | 88.84 | 87.86 | ++-------+-------------------+------------------+------------------+------------------+ + +""" +from ray import air, tune +from ray.rllib.algorithms.ppo import PPOConfig +from ray.rllib.env.multi_agent_env_runner import MultiAgentEnvRunner +from ray.rllib.examples.envs.classes.multi_agent import MultiAgentCartPole + + +my_multi_agent_progress_reporter = tune.CLIReporter( + # In the following dict, the keys are the (possibly nested) keys that can be found + # in RLlib's (PPO's) result dict, produced at every training iteration, and the + # values are the column names you would like to see in your console reports. + # Note that for nested result dict keys, you need to use slashes "/" to define the + # exact path. 
+ metric_columns={ + **{ + "training_iteration": "iter", + "time_total_s": "total time (s)", + "timesteps_total": "ts", + # RLlib always sums up all agents' rewards and reports it under: + # result_dict[sampler_results][episode_reward_mean]. + "sampler_results/episode_reward_mean": "combined return", + }, + # Because RLlib sums up all returns of all agents, we would like to also + # see the individual agents' returns. We can find these under the result dict's + # 'policy_reward_mean' key (then the policy ID): + **{ + f"policy_reward_mean/{pid}": f"return {pid}" + for pid in ["policy1", "policy2", "policy3"] + }, + }, +) + + +if __name__ == "__main__": + # Force Tuner to use old progress output as the new one silently ignores our custom + # `CLIReporter`. + # TODO (sven): Find out why we require this hack. + import os + os.environ["RAY_AIR_NEW_OUTPUT"] = "0" + + # Register our multi-agent env with a fixed number of agents. + # The agents' IDs are 0, 1, and 2. + tune.register_env("env", lambda _: MultiAgentCartPole({"num_agents": 3})) + + config = ( + PPOConfig() + .experimental(_enable_new_api_stack=True) + .rollouts(env_runner_cls=MultiAgentEnvRunner) + .environment("env") + .multi_agent( + # Define 3 policies. Note that in our simple setup, they are all configured + # the exact same way (with a PPO default RLModule/NN). + policies={"policy1", "policy2", "policy3"}, + # Map agent 0 to "policy1", etc.. + policy_mapping_fn=lambda agent_id, episode: f"policy{agent_id + 1}", + ) + ) + + stop = {"sampler_results/episode_reward_mean": 200.0} + + # Run the actual experiment (using Tune). + results = tune.Tuner( + config.algo_class, + param_space=config, + run_config=air.RunConfig( + stop=stop, + verbose=2, + # Plugin our own progress reporter. + progress_reporter=my_multi_agent_progress_reporter, + ), + ).fit() diff --git a/rllib/examples/ray_tune/custom_train_function.py b/rllib/examples/ray_tune/custom_train_function.py deleted file mode 100644 index cf8bd4c40fbd..000000000000 --- a/rllib/examples/ray_tune/custom_train_function.py +++ /dev/null @@ -1,69 +0,0 @@ -# TODO (sven): Move this example script into the new API stack. - -"""Example of a custom training workflow. Run this for a demo. - -This example shows: - - using Tune trainable functions to implement custom training workflows - -You can visualize experiment results in ~/ray_results using TensorBoard. -""" -import argparse -import os - -import ray -from ray import train, tune -from ray.rllib.algorithms.ppo import PPO, PPOConfig - -parser = argparse.ArgumentParser() -parser.add_argument( - "--framework", - choices=["tf", "tf2", "torch"], - default="torch", - help="The DL framework specifier.", -) - - -def my_train_fn(config): - iterations = config.pop("train-iterations", 10) - - config = PPOConfig().update_from_dict(config).environment("CartPole-v1") - - # Train for n iterations with high LR. 
- config.lr = 0.01 - agent1 = config.build() - for _ in range(iterations): - result = agent1.train() - result["phase"] = 1 - train.report(result) - phase1_time = result["timesteps_total"] - state = agent1.save() - agent1.stop() - - # Train for n iterations with low LR - config.lr = 0.0001 - agent2 = config.build() - agent2.restore(state) - for _ in range(iterations): - result = agent2.train() - result["phase"] = 2 - result["timesteps_total"] += phase1_time # keep time moving forward - train.report(result) - agent2.stop() - - -if __name__ == "__main__": - ray.init() - args = parser.parse_args() - config = { - # Special flag signalling `my_train_fn` how many iters to do. - "train-iterations": 2, - # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0. - "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")), - "num_workers": 0, - "framework": args.framework, - } - resources = PPO.default_resource_request(config) - tuner = tune.Tuner( - tune.with_resources(my_train_fn, resources=resources), param_space=config - ) - tuner.fit() diff --git a/rllib/utils/test_utils.py b/rllib/utils/test_utils.py index ecd9da2f896e..14700569b09b 100644 --- a/rllib/utils/test_utils.py +++ b/rllib/utils/test_utils.py @@ -1205,6 +1205,8 @@ def run_rllib_example_script_experiment( *, stop: Optional[Dict] = None, success_metric: Optional[Dict] = None, + trainable: Optional[Type] = None, + tune_callbacks: Optional[List] = None, ) -> Union[ResultDict, tune.result_grid.ResultGrid]: """Given an algorithm config and some command line args, runs an experiment. @@ -1251,6 +1253,11 @@ def run_rllib_example_script_experiment( value to be reached in order for the experiment to count as successful. If `args.as_test` is True AND this `success_metric` is not reached with the bounds defined by `stop`, will raise an Exception. + trainable: The Trainable sub-class to run in the tune.Tuner. If None (default), + use the registered RLlib Algorithm class specified by args.algo. + tune_callbacks: A list of Tune callbacks to configure with the tune.Tuner. + In case `args.wandb_key` is provided, will append a WandB logger to this + list. Returns: The last ResultDict from a --no-tune run OR the tune.Tuner.fit() @@ -1327,19 +1334,18 @@ def run_rllib_example_script_experiment( # Run the experiment using Ray Tune. # Log results using WandB. - callbacks = None if hasattr(args, "wandb_key") and args.wandb_key is not None: project = args.wandb_project or ( args.algo.lower() + "-" + re.sub("\\W+", "-", str(config.env).lower()) ) - callbacks = [ + tune_callbacks.append( WandbLoggerCallback( api_key=args.wandb_key, project=project, upload_checkpoints=True, **({"name": args.wandb_run_name} if args.wandb_run_name else {}), ) - ] + ) # Auto-configure a CLIReporter (to log the results to the console). # Use better ProgressReporter for multi-agent cases: List individual policy rewards. @@ -1351,10 +1357,10 @@ def run_rllib_example_script_experiment( "training_iteration": "iter", "time_total_s": "total time (s)", "timesteps_total": "ts", - "sampler_results/episode_reward_mean": "combined reward", + "sampler_results/episode_reward_mean": "combined return", }, **{ - f"policy_reward_mean/{pid}": f"reward {pid}" + f"policy_reward_mean/{pid}": f"return {pid}" for pid in config.policies }, }, @@ -1366,12 +1372,12 @@ def run_rllib_example_script_experiment( # Run the actual experiment (using Tune). 
results = tune.Tuner( - config.algo_class, + trainable or config.algo_class, param_space=config, run_config=air.RunConfig( stop=stop, verbose=args.verbose, - callbacks=callbacks, + callbacks=tune_callbacks, checkpoint_config=air.CheckpointConfig( checkpoint_frequency=args.checkpoint_freq, checkpoint_at_end=args.checkpoint_at_end, From 2221ac6839b3712bae16a1df8eec81fe25fc482e Mon Sep 17 00:00:00 2001 From: sven1977 Date: Mon, 15 Apr 2024 14:19:38 +0200 Subject: [PATCH 07/11] wip Signed-off-by: sven1977 --- rllib/algorithms/ppo/ppo.py | 2 +- .../examples/evaluation/custom_evaluation.py | 51 ++++++++- .../evaluation_parallel_to_training.py | 103 +++++++++++++----- rllib/examples/ray_tune/custom_experiment.py | 22 ++-- rllib/examples/ray_tune/custom_logger.py | 2 +- rllib/utils/test_utils.py | 11 +- 6 files changed, 143 insertions(+), 48 deletions(-) diff --git a/rllib/algorithms/ppo/ppo.py b/rllib/algorithms/ppo/ppo.py index 9e6f55882738..361fa09c19cf 100644 --- a/rllib/algorithms/ppo/ppo.py +++ b/rllib/algorithms/ppo/ppo.py @@ -334,7 +334,7 @@ def validate(self) -> None: elif self._enable_new_api_stack: mbs = self.mini_batch_size_per_learner or self.sgd_minibatch_size tbs = self.train_batch_size_per_learner or self.train_batch_size - if mbs > tbs: + if isinstance(mbs, int) and isinstance(tbs, int) and mbs > tbs: raise ValueError( f"`mini_batch_size_per_learner` ({mbs}) must be <= " f"`train_batch_size_per_learner` ({tbs}). In PPO, the train batch" diff --git a/rllib/examples/evaluation/custom_evaluation.py b/rllib/examples/evaluation/custom_evaluation.py index 7902ade2b520..01e2e19b1677 100644 --- a/rllib/examples/evaluation/custom_evaluation.py +++ b/rllib/examples/evaluation/custom_evaluation.py @@ -16,6 +16,51 @@ - It runs a defined number of episodes for evaluation purposes. - It collects the metrics from those runs, summarizes these metrics and returns them. + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack + +You can switch off custom evaluation (and use RLlib's default evaluation procedure) +with the `--no-custom-eval` flag. + +You can switch on parallel evaluation to training using the +`--evaluation-parallel-to-training` flag. See this example script here: +https://github.com/ray-project/ray/blob/master/rllib/examples/evaluation/evaluation_parallel_to_training.py # noqa +for more details on running evaluation parallel to training. + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. + +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + + +Results to expect +----------------- +You should see the following (or very similar) console output when running this script. +Note that for each iteration, due to the definition of our custom evaluation function, +we run 3 evaluation rounds per single training round. + +... +Training iteration 1 -> evaluation round 0 +Training iteration 1 -> evaluation round 1 +Training iteration 1 -> evaluation round 2 +... +... 
++--------------------------------+------------+-----------------+--------+ +| Trial name | status | loc | iter | +|--------------------------------+------------+-----------------+--------+ +| PPO_SimpleCorridor_06582_00000 | TERMINATED | 127.0.0.1:69905 | 4 | ++--------------------------------+------------+-----------------+--------+ ++------------------+-------+----------+--------------------+ +| total time (s) | ts | reward | episode_len_mean | +|------------------+-------+----------+--------------------| +| 26.1973 | 16000 | 0.872034 | 13.7966 | ++------------------+-------+----------+--------------------+ """ from ray.rllib.algorithms.algorithm import Algorithm from ray.rllib.algorithms.algorithm_config import AlgorithmConfig @@ -69,10 +114,10 @@ def custom_eval_function(algorithm: Algorithm, eval_workers: WorkerSet) -> Resul # processing. rollout_metrics = [] - # For demonstration purposes, run through some arbitrary number of evaluation - # round within this one call. Note that this function is called once per + # For demonstration purposes, run through some number of evaluation + # rounds within this one call. Note that this function is called once per # training iteration (`Algorithm.train()` call) OR once per `Algorithm.evaluate()` - # (which may be called manually by the user). + # (which can be called manually by the user). for i in range(3): print(f"Training iteration {algorithm.iteration} -> evaluation round {i}") # Sample episodes from the EnvRunners AND have them return only the thus diff --git a/rllib/examples/evaluation/evaluation_parallel_to_training.py b/rllib/examples/evaluation/evaluation_parallel_to_training.py index 124ad92eb65f..84313a9e5ce5 100644 --- a/rllib/examples/evaluation/evaluation_parallel_to_training.py +++ b/rllib/examples/evaluation/evaluation_parallel_to_training.py @@ -1,6 +1,72 @@ +"""Example showing how one can set up evaluation running in parallel to training. + +Such a setup saves a considerable amount of time during RL Algorithm training, b/c +the next training step does NOT have to wait for the previous evaluation procedure to +finish, but can already start running (in parallel). + +See RLlib's documentation for more details on the effect of the different supported +evaluation configuration options: +https://docs.ray.io/en/latest/rllib/rllib-advanced-api.html#customized-evaluation-during-training # noqa + +For an example of how to write a fully customized evaluation function (which normally +is not necessary as the config options are sufficient and offer maximum flexibility), +see this example script here: + +https://github.com/ray-project/ray/blob/master/rllib/examples/evaluation/custom_evaluation.py # noqa + + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack` + +Use the `--evaluation-num-workers` option to scale up the evaluation workers. Note +that the requested evaluation duration (`--evaluation-duration` measured in +`--evaluation-duration-unit`, which is either "timesteps" (default) or "episodes") is +shared between all configured evaluation workers. For example, if the evaluation +duration is 10 and the unit is "episodes" and you configured 5 workers, then each of the +evaluation workers will run exactly 2 episodes. + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. 
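The corresponding evaluation settings on the AlgorithmConfig look roughly as follows (a minimal sketch, illustrative only and not part of the patch; the script derives the exact values from the command line options described above):

```
from ray.rllib.algorithms.ppo import PPOConfig

config = (
    PPOConfig()
    .environment("CartPole-v1")
    .evaluation(
        # Evaluate after every training iteration ...
        evaluation_interval=1,
        # ... in parallel to the next training step ...
        evaluation_parallel_to_training=True,
        # ... on two dedicated evaluation workers ...
        evaluation_num_workers=2,
        # ... for as long as the parallel training step takes
        # ("auto" requires parallel evaluation) ...
        evaluation_duration="auto",
        # ... acting greedily during evaluation.
        evaluation_config={"explore": False},
    )
)
```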
+ +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + + +Results to expect +----------------- +You should see the following output (at the end of the experiment) in your console when +running with a fixed number of 100k training timesteps +(`--enable-new-api-stack --evaluation-duration=auto --stop-timesteps=100000 +--stop-reward=100000`): ++-----------------------------+------------+-----------------+--------+ +| Trial name | status | loc | iter | +|-----------------------------+------------+-----------------+--------+ +| PPO_CartPole-v1_1377a_00000 | TERMINATED | 127.0.0.1:73330 | 25 | ++-----------------------------+------------+-----------------+--------+ ++------------------+--------+----------+--------------------+ +| total time (s) | ts | reward | episode_len_mean | +|------------------+--------+----------+--------------------| +| 71.7485 | 100000 | 476.51 | 476.51 | ++------------------+--------+----------+--------------------+ + +When running without parallel evaluation (`--evaluation-not-parallel-to-training` flag), +the experiment takes considerably longer (~70sec vs ~80sec): ++-----------------------------+------------+-----------------+--------+ +| Trial name | status | loc | iter | +|-----------------------------+------------+-----------------+--------+ +| PPO_CartPole-v1_f1788_00000 | TERMINATED | 127.0.0.1:75135 | 25 | ++-----------------------------+------------+-----------------+--------+ ++------------------+--------+----------+--------------------+ +| total time (s) | ts | reward | episode_len_mean | +|------------------+--------+----------+--------------------| +| 81.7371 | 100000 | 494.68 | 494.68 | ++------------------+--------+----------+--------------------+ +""" from ray.rllib.algorithms.callbacks import DefaultCallbacks -from ray.rllib.env.multi_agent_env_runner import MultiAgentEnvRunner -from ray.rllib.env.single_agent_env_runner import SingleAgentEnvRunner from ray.rllib.examples.envs.classes.multi_agent import MultiAgentCartPole from ray.rllib.utils.test_utils import ( add_rllib_example_script_args, @@ -114,13 +180,10 @@ def on_train_result(self, *, algorithm, result, **kwargs): lambda _: MultiAgentCartPole(config={"num_agents": args.num_agents}), ) - config = ( + base_config = ( get_trainable_cls(args.algo) .get_default_config() - .experimental(_enable_new_api_stack=args.enable_new_api_stack) .environment("env" if args.num_agents > 0 else "CartPole-v1") - # Run with tracing enabled for tf2. - .framework(args.framework) # Use a custom callback that asserts that we are running the # configured exact number of episodes per evaluation OR - in auto # mode - run at least as many episodes as we have eval workers. @@ -148,28 +211,11 @@ def on_train_result(self, *, algorithm, result, **kwargs): # Switch off exploratory behavior for better (greedy) results. evaluation_config={"explore": False}, ) - .rollouts( - num_rollout_workers=args.num_env_runners, - # Set up the correct env-runner to use depending on - # old-stack/new-stack and multi-agent settings. - env_runner_cls=( - None - if not args.enable_new_api_stack - else SingleAgentEnvRunner - if args.num_agents == 0 - else MultiAgentEnvRunner - ), - ) - .resources( - num_learner_workers=args.num_gpus, - num_gpus_per_learner_worker=int(args.num_gpus != 0), - num_cpus_for_local_worker=1, - ) ) # Add a simple multi-agent setup. 
if args.num_agents > 0: - config.multi_agent( + base_config.multi_agent( policies={f"p{i}" for i in range(args.num_agents)}, policy_mapping_fn=lambda aid, *a, **kw: f"p{aid}", ) @@ -180,4 +226,11 @@ def on_train_result(self, *, algorithm, result, **kwargs): "timesteps_total": args.stop_timesteps, } - run_rllib_example_script_experiment(config, args, stop=stop) + run_rllib_example_script_experiment( + base_config, + args, + stop=stop, + success_metric={ + "evaluation/sampler_results/episode_reward_mean": args.stop_reward, + }, + ) diff --git a/rllib/examples/ray_tune/custom_experiment.py b/rllib/examples/ray_tune/custom_experiment.py index 2f371e87ce54..58c1d9b0b88d 100644 --- a/rllib/examples/ray_tune/custom_experiment.py +++ b/rllib/examples/ray_tune/custom_experiment.py @@ -2,21 +2,19 @@ You should only use such a customized workflow if the following conditions apply: - You know exactly what you are doing :) -- Simply configuring an existing RLlib Algorithm (e.g. PPO) via its AlgorithmConfig -is not enough and doesn't allow you to shape the Algorithm into behaving the way you'd -like. --- Note that for complex and custom evaluation procedures there is a RLlib Algorithm -config option (see examples/evaluation/custom_evaluation.py for more details). -- Subclassing +- Configuring an existing RLlib Algorithm (e.g. PPO) via its AlgorithmConfig +is not sufficient and doesn't allow you to shape the Algorithm into behaving the way +you'd like. Note that for complex, custom evaluation procedures there are many +AlgorithmConfig options one can use (for more details, see: +https://github.com/ray-project/ray/blob/master/rllib/examples/evaluation/custom_evaluation.py). # noqa +- Subclassing an RLlib Algorithm class and overriding the new class' `training_step` +method is not sufficient and doesn't allow you to define the algorithm's execution +logic the way you'd like. See an example here on how to customize the algorithm's +`training_step()` method: +https://github.com/ray-project/ray/blob/master/rllib/examples/algorithm/custom_training_step_on_and_off_policy_combined.py # noqa -""" - -"""Example of a custom training workflow. Run this for a demo. -This example shows: - - using Tune trainable functions to implement custom training workflows -You can visualize experiment results in ~/ray_results using TensorBoard. """ from ray import train, tune from ray.rllib.algorithms.ppo import PPO, PPOConfig diff --git a/rllib/examples/ray_tune/custom_logger.py b/rllib/examples/ray_tune/custom_logger.py index 89b59222a31f..b9c50d41a204 100644 --- a/rllib/examples/ray_tune/custom_logger.py +++ b/rllib/examples/ray_tune/custom_logger.py @@ -28,7 +28,7 @@ How to run this script ---------------------- -`python [script file name].py +`python [script file name].py` Results to expect diff --git a/rllib/utils/test_utils.py b/rllib/utils/test_utils.py index 14700569b09b..69599901f9d5 100644 --- a/rllib/utils/test_utils.py +++ b/rllib/utils/test_utils.py @@ -1245,14 +1245,13 @@ def run_rllib_example_script_experiment( "training_iteration": args.stop_iters, "timesteps_total": args.stop_timesteps, } - success_metric: Only relevent if `args.as_test` is True. + success_metric: Only relevant if `args.as_test` is True. A dict mapping a single(!) ResultDict key string (using "/" in case of nesting, e.g. "sampler_results/episode_reward_mean" for referring - to `result_dict['sampler_results']['episode_reward_mean']` to minimum - values, reaching of which will stop the experiment) to a single(!) 
minimum - value to be reached in order for the experiment to count as successful. - If `args.as_test` is True AND this `success_metric` is not reached with the - bounds defined by `stop`, will raise an Exception. + to `result_dict['sampler_results']['episode_reward_mean']` to a single(!) + minimum value to be reached in order for the experiment to count as + successful. If `args.as_test` is True AND this `success_metric` is not + reached with the bounds defined by `stop`, will raise an Exception. trainable: The Trainable sub-class to run in the tune.Tuner. If None (default), use the registered RLlib Algorithm class specified by args.algo. tune_callbacks: A list of Tune callbacks to configure with the tune.Tuner. From 93491ced5543b2954b714dae48d66aee60a89356 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Mon, 15 Apr 2024 15:42:59 +0200 Subject: [PATCH 08/11] wip Signed-off-by: sven1977 --- rllib/BUILD | 1 - rllib/algorithms/algorithm.py | 4 + rllib/algorithms/algorithm_config.py | 8 +- rllib/examples/ray_tune/custom_experiment.py | 161 +++++++++++++----- rllib/examples/ray_tune/custom_logger.py | 10 +- .../ray_tune/custom_progress_reporter.py | 1 + 6 files changed, 129 insertions(+), 56 deletions(-) diff --git a/rllib/BUILD b/rllib/BUILD index 82d844445c3d..69523777fd9e 100644 --- a/rllib/BUILD +++ b/rllib/BUILD @@ -2840,7 +2840,6 @@ py_test( tags = ["team:rllib", "exclusive", "examples"], size = "medium", srcs = ["examples/ray_tune/custom_experiment.py"], - args = ["--train-iterations=10"] ) py_test( diff --git a/rllib/algorithms/algorithm.py b/rllib/algorithms/algorithm.py index c784e809fdd8..736d93ac18e8 100644 --- a/rllib/algorithms/algorithm.py +++ b/rllib/algorithms/algorithm.py @@ -2328,6 +2328,10 @@ def load_checkpoint(self, checkpoint_dir: str) -> None: if self.config._enable_new_api_stack: learner_state_dir = os.path.join(checkpoint_dir, "learner") self.learner_group.load_state(learner_state_dir) + # Make also sure, all training EnvRunners get the just loaded weights. + weights = self.learner_group.get_weights() + self.workers.local_worker().set_weights(weights) + self.workers.sync_weights() # Call the `on_checkpoint_loaded` callback. self.callbacks.on_checkpoint_loaded(algorithm=self) diff --git a/rllib/algorithms/algorithm_config.py b/rllib/algorithms/algorithm_config.py index 17d29c9effbd..66127751c520 100644 --- a/rllib/algorithms/algorithm_config.py +++ b/rllib/algorithms/algorithm_config.py @@ -4022,10 +4022,10 @@ def _validate_to_be_deprecated_settings(self): deprecation_warning( old="AlgorithmConfig.env_task_fn", help="The `env_task_fn` API is not supported on the new API stack! " - "Curriculum learning should instead be implemented solely via " - "custom callbacks. Check out our curriculum learning example " - "script for more information: " - "https://github.com/ray-project/ray/blob/master/rllib/examples/curriculum/curriculum_learning.py" # noqa + "Curriculum learning should instead be implemented solely via " + "custom callbacks. 
Check out our curriculum learning example " + "script for more information: " + "https://github.com/ray-project/ray/blob/master/rllib/examples/curriculum/curriculum_learning.py", # noqa ) if self.preprocessor_pref not in ["rllib", "deepmind", None]: diff --git a/rllib/examples/ray_tune/custom_experiment.py b/rllib/examples/ray_tune/custom_experiment.py index 58c1d9b0b88d..330c96ab5c21 100644 --- a/rllib/examples/ray_tune/custom_experiment.py +++ b/rllib/examples/ray_tune/custom_experiment.py @@ -16,58 +16,127 @@ """ +from typing import Dict + +import numpy as np from ray import train, tune -from ray.rllib.algorithms.ppo import PPO, PPOConfig +from ray.rllib.algorithms.ppo import PPOConfig from ray.rllib.env.single_agent_env_runner import SingleAgentEnvRunner +from ray.rllib.utils.framework import try_import_torch + +torch, _ = try_import_torch() -def my_experiment(config): - iterations = config.pop("train-iterations", 10) +def my_experiment(config: Dict): - config = PPOConfig().update_from_dict(config).environment("CartPole-v1") + # Extract the number of iterations to run from the config. + train_iterations = config.pop("train-iterations", 2) + eval_episodes_to_do = config.pop("eval-episodes", 1) + + config = ( + PPOConfig() + .update_from_dict(config) + .experimental(_enable_new_api_stack=True) + .environment("CartPole-v1") + ) # Train for n iterations with high LR. - config.lr = 0.01 - agent1 = config.build() - for _ in range(iterations): - result = agent1.train() - result["phase"] = 1 - train.report(result) - phase1_time = result["timesteps_total"] - state = agent1.save() - agent1.stop() - - # Train for n iterations with low LR - config.lr = 0.0001 - agent2 = config.build() - agent2.restore(state) - for _ in range(iterations): - result = agent2.train() - result["phase"] = 2 - result["timesteps_total"] += phase1_time # keep time moving forward - train.report(result) - agent2.stop() - - # Manual Eval - config["num_workers"] = 0 - eval_algo = ppo.PPO(config=config) - eval_algo.restore(checkpoint) - env = eval_algo.workers.local_worker().env - - obs, info = env.reset() - done = False - eval_results = {"eval_reward": 0, "eval_eps_length": 0} - while not done: - action = eval_algo.compute_single_action(obs) - next_obs, reward, done, truncated, info = env.step(action) - eval_results["eval_reward"] += reward - eval_results["eval_eps_length"] += 1 + config.training(lr=0.001) + algo_high_lr = config.build() + for _ in range(train_iterations): + train_results = algo_high_lr.train() + # Add the phase to the result dict. + train_results["phase"] = 1 + train.report(train_results) + phase_high_lr_time = train_results["timesteps_total"] + checkpoint_training_high_lr = algo_high_lr.save() + algo_high_lr.stop() + + # Train for n iterations with low LR. + config.training(lr=0.00001) + algo_low_lr = config.build() + # Load state from the high-lr algo into this one. + algo_low_lr.restore(checkpoint_training_high_lr) + for _ in range(train_iterations): + train_results = algo_low_lr.train() + # Add the phase to the result dict. + train_results["phase"] = 2 + # keep time moving forward + train_results["timesteps_total"] += phase_high_lr_time + train.report(train_results) + + print( + f"after low lr training: episode return={train_results['episode_reward_mean']}" + ) + print(f"model weights={algo_low_lr.workers.local_worker().module.get_state()}") + checkpoint_training_low_lr = algo_low_lr.save() + algo_low_lr.stop() + + # After training, run a manual evaluation procedure. 
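    # (Illustrative editorial note, not part of the patch: an alternative to the
    # hand-rolled env loop below is RLlib's built-in evaluation, e.g. configuring
    # `config.evaluation(evaluation_duration=3, evaluation_duration_unit="episodes",
    # evaluation_config={"explore": False})` and calling `Algorithm.evaluate()`
    # after restoring the checkpoint; the explicit loop below instead shows the
    # low-level RLModule inference path.)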
+ + # Set the number of EnvRunners for collecting training data to 0 (local + # worker only). + config.rollouts(num_rollout_workers=0) + + eval_algo = config.build() + # Load state from the low-lr algo into this one. + eval_algo.restore(checkpoint_training_low_lr) + # The algo's local worker (SingleAgentEnvRunner) that holds a + # gym.vector.Env object and an RLModule for computing actions. + local_env_runner = eval_algo.workers.local_worker() + # Extract the gymnasium env object from the created algo (its local + # SingleAgentEnvRunner worker). Note that the env in this single-agent + # case is a gymnasium vector env and that we get its first sub-env here. + env = local_env_runner.env.envs[0] + + # The local worker (SingleAgentEnvRunner) + rl_module = local_env_runner.module + + print(f"LOADED eval model weights={rl_module.get_state()}") + + # Run a very simple env loop and add up rewards over a single episode. + obs, infos = env.reset() + episode_returns = [] + episode_lengths = [] + sum_rewards = length = 0 + num_episodes = 0 + while num_episodes < eval_episodes_to_do: + # Call the RLModule's `forward_inference()` method to compute an + # action. + rl_module_out = rl_module.forward_inference( + { + "obs": torch.from_numpy(np.expand_dims(obs, 0)), # <- add B=1 + } + ) + action_logits = rl_module_out["action_dist_inputs"][0] # <- remove B=1 + action = np.argmax(action_logits.detach().cpu().numpy()) # act greedily + + # Step the env. + obs, reward, terminated, truncated, info = env.step(action) + + # Acculumate stats and reset the env, if necessary. + sum_rewards += reward + length += 1 + if terminated or truncated: + num_episodes += 1 + episode_returns.append(sum_rewards) + episode_lengths.append(length) + sum_rewards = length = 0 + obs, infos = env.reset() + + # Compile evaluation results. + eval_results = { + "eval_returns": episode_returns, + "eval_episode_lengths": episode_lengths, + } + # Combine the most recent training results with the just collected + # evaluation results. results = {**train_results, **eval_results} + # Report everything. train.report(results) if __name__ == "__main__": - base_config = ( PPOConfig() .experimental(_enable_new_api_stack=True) @@ -84,8 +153,11 @@ def my_experiment(config): # function. config_dict = base_config.to_dict() - # Set a Special flag signalling `my_train_fn` how many iters to do. - config_dict["train-iterations"] = 2 + # Set a Special flag signalling `my_experiment` how many training steps to + # perform on each: the high learning rate and low learning rate. + config_dict["train-iterations"] = 5 + # Set a Special flag signalling `my_experiment` how many episodes to evaluate for. + config_dict["eval-episodes"] = 3 training_function = tune.with_resources( my_experiment, @@ -97,4 +169,7 @@ def my_experiment(config): # Pass in your config dict. param_space=config_dict, ) - tuner.fit() + results = tuner.fit() + best_results = results.get_best_result() + + print(f"evaluation episode returns={best_results.metrics['eval_returns']}") diff --git a/rllib/examples/ray_tune/custom_logger.py b/rllib/examples/ray_tune/custom_logger.py index b9c50d41a204..f2988bcbd6c4 100644 --- a/rllib/examples/ray_tune/custom_logger.py +++ b/rllib/examples/ray_tune/custom_logger.py @@ -68,14 +68,8 @@ def _init(self): def on_result(self, result: dict): # Define, what should happen on receiving a `result` (dict). 
mean_return = result["sampler_results"]["episode_reward_mean"] - pi_loss = ( - result["info"]["learner"]["default_policy"]["policy_loss"] - ) - print( - f"{self.prefix} " - f"Avg-return: {mean_return} " - f"pi-loss: {pi_loss}" - ) + pi_loss = result["info"]["learner"]["default_policy"]["policy_loss"] + print(f"{self.prefix} " f"Avg-return: {mean_return} " f"pi-loss: {pi_loss}") def close(self): # Releases all resources used by this logger. diff --git a/rllib/examples/ray_tune/custom_progress_reporter.py b/rllib/examples/ray_tune/custom_progress_reporter.py index f06f79312149..01244f8deb71 100644 --- a/rllib/examples/ray_tune/custom_progress_reporter.py +++ b/rllib/examples/ray_tune/custom_progress_reporter.py @@ -80,6 +80,7 @@ # `CLIReporter`. # TODO (sven): Find out why we require this hack. import os + os.environ["RAY_AIR_NEW_OUTPUT"] = "0" # Register our multi-agent env with a fixed number of agents. From cf44987ddf56a94f9b6e4bae25a7ff9a13221211 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Mon, 15 Apr 2024 16:36:00 +0200 Subject: [PATCH 09/11] wip Signed-off-by: sven1977 --- rllib/algorithms/ppo/ppo.py | 2 +- rllib/examples/ray_tune/custom_experiment.py | 30 +++++++++++++++----- 2 files changed, 24 insertions(+), 8 deletions(-) diff --git a/rllib/algorithms/ppo/ppo.py b/rllib/algorithms/ppo/ppo.py index 73001b31b330..8d5157c7bc4b 100644 --- a/rllib/algorithms/ppo/ppo.py +++ b/rllib/algorithms/ppo/ppo.py @@ -334,7 +334,7 @@ def validate(self) -> None: elif self._enable_new_api_stack: mbs = self.mini_batch_size_per_learner or self.sgd_minibatch_size tbs = self.train_batch_size_per_learner or self.train_batch_size - if isinstance(mbs, int) and isinstance(tbs, int) and mbs > tbs: + if mbs > tbs: raise ValueError( f"`mini_batch_size_per_learner` ({mbs}) must be <= " f"`train_batch_size_per_learner` ({tbs}). In PPO, the train batch" diff --git a/rllib/examples/ray_tune/custom_experiment.py b/rllib/examples/ray_tune/custom_experiment.py index 330c96ab5c21..35464f66667b 100644 --- a/rllib/examples/ray_tune/custom_experiment.py +++ b/rllib/examples/ray_tune/custom_experiment.py @@ -14,7 +14,29 @@ https://github.com/ray-project/ray/blob/master/rllib/examples/algorithm/custom_training_step_on_and_off_policy_combined.py # noqa - +How to run this script +---------------------- +`python [script file name].py` + + +Results to expect +----------------- +You should see the following output (at the end of the experiment) in your console: + +╭─────────────────────────────────────────────────────────────────────────────────────── +│ Trial name status iter total time (s) ts +├─────────────────────────────────────────────────────────────────────────────────────── +│ my_experiment_CartPole-v1_77083_00000 TERMINATED 10 36.7799 60000 +╰─────────────────────────────────────────────────────────────────────────────────────── +╭───────────────────────────────────────────────────────╮ +│ reward episode_len_mean episodes_this_iter │ +├───────────────────────────────────────────────────────┤ +│ 254.821 254.821 12 │ +╰───────────────────────────────────────────────────────╯ +evaluation episode returns=[500.0, 500.0, 500.0] + +Note that evaluation results (on the CartPole-v1 env) should be close to perfect +(episode return of ~500.0) as we are acting greedily inside the evaluation procedure. 
""" from typing import Dict @@ -65,10 +87,6 @@ def my_experiment(config: Dict): train_results["timesteps_total"] += phase_high_lr_time train.report(train_results) - print( - f"after low lr training: episode return={train_results['episode_reward_mean']}" - ) - print(f"model weights={algo_low_lr.workers.local_worker().module.get_state()}") checkpoint_training_low_lr = algo_low_lr.save() algo_low_lr.stop() @@ -92,8 +110,6 @@ def my_experiment(config: Dict): # The local worker (SingleAgentEnvRunner) rl_module = local_env_runner.module - print(f"LOADED eval model weights={rl_module.get_state()}") - # Run a very simple env loop and add up rewards over a single episode. obs, infos = env.reset() episode_returns = [] From 9f872667108eaaf93f072b8f1affa7182dbe1449 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Mon, 15 Apr 2024 19:24:53 +0200 Subject: [PATCH 10/11] fixes Signed-off-by: sven1977 --- rllib/BUILD | 6 +++--- rllib/tests/test_ray_client.py | 33 ++++----------------------------- 2 files changed, 7 insertions(+), 32 deletions(-) diff --git a/rllib/BUILD b/rllib/BUILD index 69523777fd9e..6e54ec137224 100644 --- a/rllib/BUILD +++ b/rllib/BUILD @@ -2851,11 +2851,11 @@ py_test( ) py_test( - name = "examples/ray_tune/custom_progres_reporter", - main = "examples/ray_tune/custom_progres_reporter.py", + name = "examples/ray_tune/custom_progress_reporter", + main = "examples/ray_tune/custom_progress_reporter.py", tags = ["team:rllib", "exclusive", "examples"], size = "small", - srcs = ["examples/ray_tune/custom_progres_reporter.py"], + srcs = ["examples/ray_tune/custom_progress_reporter.py"], ) # subdirectory: rl_modules/ diff --git a/rllib/tests/test_ray_client.py b/rllib/tests/test_ray_client.py index b60e8b1f1b3e..06d573537607 100644 --- a/rllib/tests/test_ray_client.py +++ b/rllib/tests/test_ray_client.py @@ -14,12 +14,12 @@ def test_connection(self): assert ray.util.client.ray.is_connected() assert ray.util.client.ray.is_connected() is False - def test_custom_train_function(self): + def test_custom_experiment(self): with ray_start_client_server(): assert ray.util.client.ray.is_connected() config = { - # Special flag signalling `my_train_fn` how many iters to do. + # Special flag signalling `my_experiment` how many iters to do. "train-iterations": 2, "lr": 0.01, # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0. @@ -28,10 +28,10 @@ def test_custom_train_function(self): "framework": "tf", } resources = ppo.PPO.default_resource_request(config) - from ray.rllib.examples.ray_tune.custom_train_function import my_train_fn + from ray.rllib.examples.ray_tune.custom_experiment import my_experiment tune.Tuner( - tune.with_resources(my_train_fn, resources), + tune.with_resources(my_experiment, resources), param_space=config, ).fit() @@ -53,31 +53,6 @@ def test_cartpole_lstm(self): run_config=air.RunConfig(stop=stop, verbose=2), ).fit() - def test_custom_experiment(self): - - with ray_start_client_server(ray_init_kwargs={"num_cpus": 3}): - assert ray.util.client.ray.is_connected() - - config = ppo.PPOConfig().environment("CartPole-v1") - # Special flag signalling `experiment` how many iters to do. - config = config.to_dict() - config["train-iterations"] = 2 - - from ray.rllib.examples.ray_tune.custom_experiment import experiment - - # Ray client does not seem to propagate the `fn._resources` property - # correctly for imported functions. As a workaround, we can wrap the - # imported function which forces a full transfer. 
- def wrapped_experiment(config): - experiment(config) - - tune.Tuner( - tune.with_resources( - wrapped_experiment, ppo.PPO.default_resource_request(config) - ), - param_space=config, - ).fit() - if __name__ == "__main__": import pytest From 8a1002a00d0a144e662576a12e3a17b1c9e21779 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Mon, 15 Apr 2024 22:25:46 +0200 Subject: [PATCH 11/11] fix Signed-off-by: sven1977 --- rllib/BUILD | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rllib/BUILD b/rllib/BUILD index 6e54ec137224..729f8b1c353e 100644 --- a/rllib/BUILD +++ b/rllib/BUILD @@ -2854,7 +2854,7 @@ py_test( name = "examples/ray_tune/custom_progress_reporter", main = "examples/ray_tune/custom_progress_reporter.py", tags = ["team:rllib", "exclusive", "examples"], - size = "small", + size = "medium", srcs = ["examples/ray_tune/custom_progress_reporter.py"], )
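For reference, the skeleton shared by the example scripts touched in this series, built on the two `rllib/utils/test_utils.py` helpers patched above, looks roughly like this (an illustrative sketch, not part of the patch):

```
from ray.rllib.utils.test_utils import (
    add_rllib_example_script_args,
    run_rllib_example_script_experiment,
)
from ray.tune.registry import get_trainable_cls

parser = add_rllib_example_script_args()

if __name__ == "__main__":
    args = parser.parse_args()

    base_config = (
        get_trainable_cls(args.algo)
        .get_default_config()
        .environment("CartPole-v1")
    )

    run_rllib_example_script_experiment(base_config, args)
```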