
[RLlib] Cleanup examples folder 07: Translate custom_experiment, custom_logger, custom_progress_reporter to new API stack. #44735

11 changes: 3 additions & 8 deletions rllib/BUILD
@@ -2834,33 +2834,28 @@ py_test(

# subdirectory: ray_tune/
# ....................................
#@OldAPIStack
py_test(
name = "examples/ray_tune/custom_experiment",
main = "examples/ray_tune/custom_experiment.py",
tags = ["team:rllib", "exclusive", "examples"],
size = "medium",
srcs = ["examples/ray_tune/custom_experiment.py"],
args = ["--train-iterations=10"]
)

#@OldAPIStack
py_test(
name = "examples/ray_tune/custom_logger",
main = "examples/ray_tune/custom_logger.py",
tags = ["team:rllib", "exclusive", "examples"],
size = "medium",
srcs = ["examples/ray_tune/custom_logger.py"],
args = ["--stop-iters=3"]
)

#@OldAPIStack
py_test(
name = "examples/ray_tune/custom_train_function",
main = "examples/ray_tune/custom_train_function.py",
name = "examples/ray_tune/custom_progress_reporter",
main = "examples/ray_tune/custom_progress_reporter.py",
tags = ["team:rllib", "exclusive", "examples"],
size = "medium",
srcs = ["examples/ray_tune/custom_train_function.py"],
srcs = ["examples/ray_tune/custom_progress_reporter.py"],
)

# subdirectory: rl_modules/
4 changes: 4 additions & 0 deletions rllib/algorithms/algorithm.py
@@ -2328,6 +2328,10 @@ def load_checkpoint(self, checkpoint_dir: str) -> None:
if self.config._enable_new_api_stack:
learner_state_dir = os.path.join(checkpoint_dir, "learner")
self.learner_group.load_state(learner_state_dir)
# Also make sure that all training EnvRunners get the just loaded weights.
weights = self.learner_group.get_weights()
self.workers.local_worker().set_weights(weights)
self.workers.sync_weights()

# Call the `on_checkpoint_loaded` callback.
self.callbacks.on_checkpoint_loaded(algorithm=self)
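To illustrate the effect of the change above, here is a minimal, hedged usage sketch (not part of this PR): restoring an `Algorithm` from a checkpoint on the new API stack now also pushes the restored Learner weights to the local worker and all remote EnvRunners, so training resumes with consistent weights. The checkpoint path below is hypothetical.

```python
# Minimal usage sketch (assumption: a checkpoint was saved earlier under a
# hypothetical path; this is not code from this PR).
from ray.rllib.algorithms.algorithm import Algorithm

checkpoint_dir = "/tmp/ppo_cartpole_checkpoint"  # hypothetical path

# Restoring goes through `load_checkpoint()`; with this PR, the restored
# Learner weights are also synced to all training EnvRunners.
algo = Algorithm.from_checkpoint(checkpoint_dir)

# Continue training right away; EnvRunners sample with the restored weights.
result = algo.train()
print(result)
```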
11 changes: 11 additions & 0 deletions rllib/algorithms/algorithm_config.py
@@ -4064,6 +4064,17 @@ def _validate_new_api_stack_settings(self):
# TODO (sven): Once everything is on the new API stack, we won't need this method
# anymore.
def _validate_to_be_deprecated_settings(self):
# Env task fn will be deprecated.
if self._enable_new_api_stack and self.env_task_fn is not None:
deprecation_warning(
old="AlgorithmConfig.env_task_fn",
help="The `env_task_fn` API is not supported on the new API stack! "
"Curriculum learning should instead be implemented solely via "
"custom callbacks. Check out our curriculum learning example "
"script for more information: "
"https://github.com/ray-project/ray/blob/master/rllib/examples/curriculum/curriculum_learning.py", # noqa
)

if self.preprocessor_pref not in ["rllib", "deepmind", None]:
raise ValueError(
"`config.preprocessor_pref` must be either 'rllib', 'deepmind' or None!"
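For context on the deprecation message above, here is a hedged sketch of curriculum learning via a custom callback instead of `env_task_fn`. It assumes a custom environment exposing a hypothetical `set_task()` method, uses the old-stack `foreach_env()` helper for brevity, and picks an arbitrary reward threshold; the linked curriculum_learning.py example is the authoritative reference.

```python
# Sketch only -- assumes a custom env with a hypothetical `set_task()` method.
from ray.rllib.algorithms.callbacks import DefaultCallbacks


class CurriculumCallback(DefaultCallbacks):
    def on_train_result(self, *, algorithm, result, **kwargs):
        # Bump the task difficulty once the mean return passes a threshold
        # (threshold and task id are illustrative).
        if result["episode_reward_mean"] > 200.0:
            algorithm.workers.foreach_worker(
                lambda worker: worker.foreach_env(lambda env: env.set_task(2))
            )


# Wire the callback into the config (replacing any `env_task_fn` usage):
# config = config.callbacks(CurriculumCallback)
```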
51 changes: 48 additions & 3 deletions rllib/examples/evaluation/custom_evaluation.py
@@ -16,6 +16,51 @@
- It runs a defined number of episodes for evaluation purposes.
- It collects the metrics from those runs, summarizes these metrics and returns them.


How to run this script
----------------------
`python [script file name].py --enable-new-api-stack`

You can switch off custom evaluation (and use RLlib's default evaluation procedure)
with the `--no-custom-eval` flag.

You can switch on parallel evaluation to training using the
`--evaluation-parallel-to-training` flag. See this example script here:
https://github.com/ray-project/ray/blob/master/rllib/examples/evaluation/evaluation_parallel_to_training.py # noqa
for more details on running evaluation parallel to training.

For debugging, use the following additional command line options
`--no-tune --num-env-runners=0`
which should allow you to set breakpoints anywhere in the RLlib code and
have the execution stop there for inspection and debugging.

For logging to your WandB account, use:
`--wandb-key=[your WandB API key] --wandb-project=[some project name]
--wandb-run-name=[optional: WandB run name (within the defined project)]`


Results to expect
-----------------
You should see the following (or very similar) console output when running this script.
Note that for each iteration, due to the definition of our custom evaluation function,
we run 3 evaluation rounds per single training round.

...
Training iteration 1 -> evaluation round 0
Training iteration 1 -> evaluation round 1
Training iteration 1 -> evaluation round 2
...
...
+--------------------------------+------------+-----------------+--------+
| Trial name | status | loc | iter |
|--------------------------------+------------+-----------------+--------+
| PPO_SimpleCorridor_06582_00000 | TERMINATED | 127.0.0.1:69905 | 4 |
+--------------------------------+------------+-----------------+--------+
+------------------+-------+----------+--------------------+
| total time (s) | ts | reward | episode_len_mean |
|------------------+-------+----------+--------------------|
| 26.1973 | 16000 | 0.872034 | 13.7966 |
+------------------+-------+----------+--------------------+
"""
from ray.rllib.algorithms.algorithm import Algorithm
from ray.rllib.algorithms.algorithm_config import AlgorithmConfig
@@ -69,10 +114,10 @@ def custom_eval_function(algorithm: Algorithm, eval_workers: WorkerSet) -> Resul
# processing.
rollout_metrics = []

# For demonstration purposes, run through some arbitrary number of evaluation
# round within this one call. Note that this function is called once per
# For demonstration purposes, run through some number of evaluation
# rounds within this one call. Note that this function is called once per
# training iteration (`Algorithm.train()` call) OR once per `Algorithm.evaluate()`
# (which may be called manually by the user).
# (which can be called manually by the user).
for i in range(3):
print(f"Training iteration {algorithm.iteration} -> evaluation round {i}")
# Sample episodes from the EnvRunners AND have them return only the thus
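As a complement to the diff above, this is a hedged sketch of how a custom evaluation function gets wired into a config via `AlgorithmConfig.evaluation(custom_evaluation_function=...)`. The toy evaluation function below is a stand-in for the example's `custom_eval_function`, and the environment and worker counts are illustrative.

```python
# Hedged wiring sketch -- not code from this PR.
from ray.rllib.algorithms.ppo import PPOConfig


def my_custom_eval(algorithm, eval_workers):
    # Toy stand-in for the example's `custom_eval_function`: sample once on
    # each eval worker and return a (here: empty) metrics dict.
    eval_workers.foreach_worker(lambda worker: worker.sample())
    return {}


config = (
    PPOConfig()
    .environment("CartPole-v1")
    .evaluation(
        custom_evaluation_function=my_custom_eval,
        evaluation_interval=1,
        evaluation_num_workers=2,
    )
)
algo = config.build()
# `evaluate()` now runs the custom function instead of the default procedure.
print(algo.evaluate())
```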
103 changes: 78 additions & 25 deletions rllib/examples/evaluation/evaluation_parallel_to_training.py
@@ -1,6 +1,72 @@
"""Example showing how one can set up evaluation running in parallel to training.

Such a setup saves a considerable amount of time during RL Algorithm training, because
the next training step does NOT have to wait for the previous evaluation procedure to
finish, but can already start running (in parallel).

See RLlib's documentation for more details on the effect of the different supported
evaluation configuration options:
https://docs.ray.io/en/latest/rllib/rllib-advanced-api.html#customized-evaluation-during-training # noqa

For an example of how to write a fully customized evaluation function (which normally
is not necessary as the config options are sufficient and offer maximum flexibility),
see this example script here:

https://github.com/ray-project/ray/blob/master/rllib/examples/evaluation/custom_evaluation.py # noqa


How to run this script
----------------------
`python [script file name].py --enable-new-api-stack`

Use the `--evaluation-num-workers` option to scale up the evaluation workers. Note
that the requested evaluation duration (`--evaluation-duration` measured in
`--evaluation-duration-unit`, which is either "timesteps" (default) or "episodes") is
shared between all configured evaluation workers. For example, if the evaluation
duration is 10 and the unit is "episodes" and you configured 5 workers, then each of the
evaluation workers will run exactly 2 episodes.

For debugging, use the following additional command line options
`--no-tune --num-env-runners=0`
which should allow you to set breakpoints anywhere in the RLlib code and
have the execution stop there for inspection and debugging.

For logging to your WandB account, use:
`--wandb-key=[your WandB API key] --wandb-project=[some project name]
--wandb-run-name=[optional: WandB run name (within the defined project)]`


Results to expect
-----------------
You should see the following output (at the end of the experiment) in your console when
running with a fixed number of 100k training timesteps
(`--enable-new-api-stack --evaluation-duration=auto --stop-timesteps=100000
--stop-reward=100000`):
+-----------------------------+------------+-----------------+--------+
| Trial name | status | loc | iter |
|-----------------------------+------------+-----------------+--------+
| PPO_CartPole-v1_1377a_00000 | TERMINATED | 127.0.0.1:73330 | 25 |
+-----------------------------+------------+-----------------+--------+
+------------------+--------+----------+--------------------+
| total time (s) | ts | reward | episode_len_mean |
|------------------+--------+----------+--------------------|
| 71.7485 | 100000 | 476.51 | 476.51 |
+------------------+--------+----------+--------------------+

When running without parallel evaluation (`--evaluation-not-parallel-to-training` flag),

the experiment takes considerably longer (~80sec vs ~70sec):
+-----------------------------+------------+-----------------+--------+
| Trial name | status | loc | iter |
|-----------------------------+------------+-----------------+--------+
| PPO_CartPole-v1_f1788_00000 | TERMINATED | 127.0.0.1:75135 | 25 |
+-----------------------------+------------+-----------------+--------+
+------------------+--------+----------+--------------------+
| total time (s) | ts | reward | episode_len_mean |
|------------------+--------+----------+--------------------|
| 81.7371 | 100000 | 494.68 | 494.68 |
+------------------+--------+----------+--------------------+
"""
from ray.rllib.algorithms.callbacks import DefaultCallbacks
from ray.rllib.env.multi_agent_env_runner import MultiAgentEnvRunner
from ray.rllib.env.single_agent_env_runner import SingleAgentEnvRunner
from ray.rllib.examples.envs.classes.multi_agent import MultiAgentCartPole
from ray.rllib.utils.test_utils import (
add_rllib_example_script_args,
@@ -114,13 +180,10 @@ def on_train_result(self, *, algorithm, result, **kwargs):
lambda _: MultiAgentCartPole(config={"num_agents": args.num_agents}),
)

config = (
base_config = (
get_trainable_cls(args.algo)
.get_default_config()
.experimental(_enable_new_api_stack=args.enable_new_api_stack)
.environment("env" if args.num_agents > 0 else "CartPole-v1")
# Run with tracing enabled for tf2.
.framework(args.framework)
# Use a custom callback that asserts that we are running the
# configured exact number of episodes per evaluation OR - in auto
# mode - run at least as many episodes as we have eval workers.
@@ -148,28 +211,11 @@ def on_train_result(self, *, algorithm, result, **kwargs):
# Switch off exploratory behavior for better (greedy) results.
evaluation_config={"explore": False},
)
.rollouts(
num_rollout_workers=args.num_env_runners,
# Set up the correct env-runner to use depending on
# old-stack/new-stack and multi-agent settings.
env_runner_cls=(
None
if not args.enable_new_api_stack
else SingleAgentEnvRunner
if args.num_agents == 0
else MultiAgentEnvRunner
),
)
.resources(
num_learner_workers=args.num_gpus,
num_gpus_per_learner_worker=int(args.num_gpus != 0),
num_cpus_for_local_worker=1,
)
)

# Add a simple multi-agent setup.
if args.num_agents > 0:
config.multi_agent(
base_config.multi_agent(
policies={f"p{i}" for i in range(args.num_agents)},
policy_mapping_fn=lambda aid, *a, **kw: f"p{aid}",
)
@@ -180,4 +226,11 @@ def on_train_result(self, *, algorithm, result, **kwargs):
"timesteps_total": args.stop_timesteps,
}

run_rllib_example_script_experiment(config, args, stop=stop)
run_rllib_example_script_experiment(
base_config,
args,
stop=stop,
success_metric={
"evaluation/sampler_results/episode_reward_mean": args.stop_reward,
},
)
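For reference, here is a hedged sketch of the evaluation settings that the timing comparison above hinges on; the algorithm, environment, and numbers are illustrative and not taken from this script.

```python
# Illustrative settings only -- see the example script for the real config.
from ray.rllib.algorithms.ppo import PPOConfig

config = (
    PPOConfig()
    .environment("CartPole-v1")
    .evaluation(
        # Run evaluation concurrently with the next training step.
        evaluation_parallel_to_training=True,
        evaluation_interval=1,
        evaluation_num_workers=2,
        # "auto": evaluate for as long as the parallel training step takes;
        # a fixed duration (e.g. 10) is also possible.
        evaluation_duration="auto",
        evaluation_duration_unit="timesteps",
    )
)
```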