[RLlib] New API stack: Add systematic IMPALA learning tests for [CartPole|Pendulum] | [CPU|GPU|multi-CPU|multi-GPU] | [single- and multi-agent]. #46162

Merged
11 commits merged on Jun 22, 2024
60 changes: 57 additions & 3 deletions rllib/BUILD
@@ -333,6 +333,62 @@ py_test(
srcs = ["tuned_examples/impala/cartpole_impala.py"],
args = ["--as-test", "--enable-new-api-stack"]
)
py_test(
name = "learning_tests_cartpole_impala_gpu",
main = "tuned_examples/impala/cartpole_impala.py",
tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_cartpole", "learning_tests_discrete", "learning_tests_pytorch_use_all_core", "gpu"],
size = "large",
srcs = ["tuned_examples/impala/cartpole_impala.py"],
args = ["--as-test", "--enable-new-api-stack", "--num-gpus=1"]
)
py_test(
name = "learning_tests_cartpole_impala_multi_cpu",
main = "tuned_examples/impala/cartpole_impala.py",
tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_cartpole", "learning_tests_discrete", "learning_tests_pytorch_use_all_core"],
size = "large",
srcs = ["tuned_examples/impala/cartpole_impala.py"],
args = ["--as-test", "--enable-new-api-stack", "--num-gpus=2"]
)
py_test(
name = "learning_tests_cartpole_impala_multi_gpu",
main = "tuned_examples/impala/cartpole_impala.py",
tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_cartpole", "learning_tests_discrete", "learning_tests_pytorch_use_all_core", "multi_gpu"],
size = "large",
srcs = ["tuned_examples/impala/cartpole_impala.py"],
args = ["--as-test", "--enable-new-api-stack", "--num-gpus=2"]
)
py_test(
name = "learning_tests_multi_agent_cartpole_impala",
main = "tuned_examples/impala/multi_agent_cartpole_impala.py",
tags = ["team:rllib", "exclusive", "learning_tests", "learning_tests_cartpole", "learning_tests_discrete", "torch_only"],
size = "large",
srcs = ["tuned_examples/impala/multi_agent_cartpole_impala.py"],
args = ["--as-test", "--enable-new-api-stack", "--num-agents=2", "--num-cpus=5"]
)
py_test(
name = "learning_tests_multi_agent_cartpole_impala_gpu",
main = "tuned_examples/impala/multi_agent_cartpole_impala.py",
tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_cartpole", "learning_tests_discrete", "learning_tests_pytorch_use_all_core", "gpu"],
size = "large",
srcs = ["tuned_examples/impala/multi_agent_cartpole_impala.py"],
args = ["--as-test", "--enable-new-api-stack", "--num-agents=2", "--num-gpus=1", "--num-cpus=6"]
)
py_test(
name = "learning_tests_multi_agent_cartpole_impala_multi_cpu",
main = "tuned_examples/impala/multi_agent_cartpole_impala.py",
tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_cartpole", "learning_tests_discrete", "learning_tests_pytorch_use_all_core"],
size = "large",
srcs = ["tuned_examples/impala/multi_agent_cartpole_impala.py"],
args = ["--as-test", "--enable-new-api-stack", "--num-agents=2", "--num-gpus=2", "--num-cpus=7"]
)
py_test(
name = "learning_tests_multi_agent_cartpole_impala_multi_gpu",
main = "tuned_examples/impala/multi_agent_cartpole_impala.py",
tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_cartpole", "learning_tests_discrete", "learning_tests_pytorch_use_all_core", "multi_gpu"],
size = "large",
srcs = ["tuned_examples/impala/multi_agent_cartpole_impala.py"],
args = ["--as-test", "--enable-new-api-stack", "--num-agents=2", "--num-gpus=2", "--num-cpus=7"]
)

#@OldAPIStack
py_test(
@@ -346,18 +402,16 @@ py_test(
],
args = ["--dir=tuned_examples/impala"]
)

#@OldAPIStack
py_test(
name = "learning_tests_multi_agent_cartpole_impala_old_api_stack",
main = "tests/run_regression_tests.py",
tags = ["team:rllib", "exclusive", "learning_tests", "learning_tests_cartpole", "learning_tests_discrete"],
size = "medium",
srcs = ["tests/run_regression_tests.py"],
data = ["tuned_examples/impala/multi_agent_cartpole_impala.py"],
data = ["tuned_examples/impala/multi_agent_cartpole_impala_old_api_stack.py"],
args = ["--dir=tuned_examples/impala"]
)

#@OldAPIStack
py_test(
name = "learning_tests_cartpole_impala_fake_gpus_old_api_stack",
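The new targets above all point at the same tuned-example script and differ only in their CLI flags (--num-gpus, --num-cpus, --num-agents). As rough orientation, the sketch below shows how such a script is typically wired together with the helpers that appear later in this diff; it is a minimal sketch only, assuming the shared parser exposes the flags used in the BUILD targets, and the actual cartpole_impala.py may differ.

from ray.rllib.algorithms.impala import ImpalaConfig
from ray.rllib.utils.test_utils import (
    add_rllib_example_script_args,
    run_rllib_example_script_experiment,
)

# Shared example-script parser; the BUILD targets only vary flags such as
# --enable-new-api-stack, --as-test, --num-gpus, --num-cpus, --num-agents.
parser = add_rllib_example_script_args()
args = parser.parse_args()

# Minimal IMPALA config; the runner helper applies the parsed resource and
# scaling flags before launching the experiment.
config = ImpalaConfig().environment("CartPole-v1")

# Hypothetical stop criterion for this sketch only.
stop = {"training_iteration": 200}

if __name__ == "__main__":
    run_rllib_example_script_experiment(config, args, stop=stop)
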
45 changes: 18 additions & 27 deletions rllib/algorithms/algorithm.py
@@ -39,7 +39,7 @@
from ray.rllib.algorithms.algorithm_config import AlgorithmConfig
from ray.rllib.algorithms.registry import ALGORITHMS_CLASS_TO_NAME as ALL_ALGORITHMS
from ray.rllib.connectors.agent.obs_preproc import ObsPreprocessorConnector
from ray.rllib.core import DEFAULT_AGENT_ID, DEFAULT_MODULE_ID
from ray.rllib.core import DEFAULT_MODULE_ID
from ray.rllib.core.columns import Columns
from ray.rllib.core.rl_module.marl_module import MultiAgentRLModuleSpec
from ray.rllib.core.rl_module.rl_module import RLModule, SingleAgentRLModuleSpec
@@ -93,6 +93,7 @@
ALL_MODULES,
ENV_RUNNER_RESULTS,
ENV_RUNNER_SAMPLING_TIMER,
EPISODE_LEN_MEAN,
EPISODE_RETURN_MAX,
EPISODE_RETURN_MEAN,
EPISODE_RETURN_MIN,
@@ -273,12 +274,12 @@ class Algorithm(Trainable, AlgorithmBase):
_override_all_key_list = ["off_policy_estimation_methods", "policies"]

_progress_metrics = (
f"{ENV_RUNNER_RESULTS}/episode_return_mean",
f"{EVALUATION_RESULTS}/{ENV_RUNNER_RESULTS}/episode_return_mean",
f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}",
f"{EVALUATION_RESULTS}/{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}",
f"{NUM_ENV_STEPS_SAMPLED_LIFETIME}",
f"{NUM_ENV_STEPS_TRAINED_LIFETIME}",
f"{NUM_EPISODES_LIFETIME}",
f"{ENV_RUNNER_RESULTS}/episode_len_mean",
f"{ENV_RUNNER_RESULTS}/{EPISODE_LEN_MEAN}",
)

@staticmethod
@@ -480,20 +481,6 @@ def __init__(
# components (including timers, counters and other stats in its own
# `training_step()` and other methods) as well as custom callbacks.
self.metrics = MetricsLogger()
# Initialize lifetime counters (or those that are common as Tune stop criteria.
# We don't want tune to crash regularly b/c these stats might be still missing
# entirely after the first few iterations.
self.metrics.log_dict(
{
NUM_ENV_STEPS_SAMPLED_LIFETIME: 0,
NUM_AGENT_STEPS_SAMPLED_LIFETIME: {DEFAULT_AGENT_ID: 0},
NUM_ENV_STEPS_TRAINED_LIFETIME: 0,
NUM_AGENT_STEPS_TRAINED_LIFETIME: {DEFAULT_AGENT_ID: 0},
NUM_EPISODES_LIFETIME: 0,
f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": np.nan,
},
reduce="sum",
)

# Create a default logger creator if no logger_creator is specified
if logger_creator is None:
@@ -914,7 +901,7 @@ def step(self) -> ResultDict:
self.workers.sync_env_runner_states(
config=self.config,
env_steps_sampled=self.metrics.peek(
NUM_ENV_STEPS_SAMPLED_LIFETIME
NUM_ENV_STEPS_SAMPLED_LIFETIME, default=0
),
)
# Compile final ResultDict from `train_results` and `eval_results`. Note
@@ -3632,16 +3619,20 @@ def __enter__(self):
self.trained = 0
if self.algo.config.enable_env_runner_and_connector_v2:
self.init_env_steps_sampled = self.algo.metrics.peek(
NUM_ENV_STEPS_SAMPLED_LIFETIME
NUM_ENV_STEPS_SAMPLED_LIFETIME, default=0
)
self.init_env_steps_trained = self.algo.metrics.peek(
NUM_ENV_STEPS_TRAINED_LIFETIME
NUM_ENV_STEPS_TRAINED_LIFETIME, default=0
)
self.init_agent_steps_sampled = sum(
self.algo.metrics.peek(NUM_AGENT_STEPS_SAMPLED_LIFETIME).values()
self.algo.metrics.peek(
NUM_AGENT_STEPS_SAMPLED_LIFETIME, default={}
).values()
)
self.init_agent_steps_trained = sum(
self.algo.metrics.peek(NUM_AGENT_STEPS_TRAINED_LIFETIME).values()
self.algo.metrics.peek(
NUM_AGENT_STEPS_TRAINED_LIFETIME, default={}
).values()
)
else:
self.init_env_steps_sampled = self.algo._counters[NUM_ENV_STEPS_SAMPLED]
@@ -3681,26 +3672,26 @@ def should_stop(self, results):
self.sampled = (
sum(
self.algo.metrics.peek(
NUM_AGENT_STEPS_SAMPLED_LIFETIME
NUM_AGENT_STEPS_SAMPLED_LIFETIME, default={}
).values()
)
- self.init_agent_steps_sampled
)
self.trained = (
sum(
self.algo.metrics.peek(
NUM_AGENT_STEPS_TRAINED_LIFETIME
NUM_AGENT_STEPS_TRAINED_LIFETIME, default={}
).values()
)
- self.init_agent_steps_trained
)
else:
self.sampled = (
self.algo.metrics.peek(NUM_ENV_STEPS_SAMPLED_LIFETIME)
self.algo.metrics.peek(NUM_ENV_STEPS_SAMPLED_LIFETIME, default=0)
- self.init_env_steps_sampled
)
self.trained = (
self.algo.metrics.peek(NUM_ENV_STEPS_TRAINED_LIFETIME)
self.algo.metrics.peek(NUM_ENV_STEPS_TRAINED_LIFETIME, default=0)
- self.init_env_steps_trained
)
else:
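The block removed from __init__ above pre-logged every lifetime counter as 0 (plus a NaN episode-return key) so that Tune stop criteria would not crash while these stats were still missing in the first iterations. The replacement leaves the logger empty and instead passes a default to MetricsLogger.peek() at every read site, as the changes above in step(), __enter__(), and should_stop() show. A minimal sketch of the assumed behavior follows; peek(..., default=...) and reduce="sum" are taken from this diff, while the MetricsLogger import path and the log_value() call are assumptions.

from ray.rllib.utils.metrics import NUM_ENV_STEPS_SAMPLED_LIFETIME
from ray.rllib.utils.metrics.metrics_logger import MetricsLogger

metrics = MetricsLogger()

# Nothing has been logged yet: peek() with a default falls back to 0 instead
# of failing on the missing key.
assert metrics.peek(NUM_ENV_STEPS_SAMPLED_LIFETIME, default=0) == 0

# Once values are logged with sum-reduction, peek() reflects the running total.
metrics.log_value(NUM_ENV_STEPS_SAMPLED_LIFETIME, 1000, reduce="sum")
metrics.log_value(NUM_ENV_STEPS_SAMPLED_LIFETIME, 500, reduce="sum")
print(metrics.peek(NUM_ENV_STEPS_SAMPLED_LIFETIME, default=0))  # expected: 1500
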
56 changes: 34 additions & 22 deletions rllib/tuned_examples/impala/multi_agent_cartpole_impala.py
@@ -1,43 +1,55 @@
# @OldAPIStack
from ray.rllib.algorithms.impala import ImpalaConfig
from ray.rllib.examples.envs.classes.multi_agent import MultiAgentCartPole
from ray.rllib.utils.metrics import (
ENV_RUNNER_RESULTS,
EPISODE_RETURN_MEAN,
NUM_ENV_STEPS_SAMPLED_LIFETIME,
)
from ray import tune
from ray.rllib.utils.test_utils import add_rllib_example_script_args
from ray.tune.registry import register_env

tune.registry.register_env("env", lambda cfg: MultiAgentCartPole(config=cfg))
parser = add_rllib_example_script_args()
parser.set_defaults(num_agents=2, num_env_runners=4)
# Use `parser` to add your own custom command line options to this script
# and (if needed) use their values to set up `config` below.
args = parser.parse_args()

register_env("env", lambda cfg: MultiAgentCartPole(config=cfg))


config = (
ImpalaConfig()
.environment("env", env_config={"num_agents": 4})
.env_runners(
num_envs_per_env_runner=5,
num_env_runners=4,
observation_filter="MeanStdFilter",
)
.resources(num_gpus=1, _fake_gpus=True)
.multi_agent(
policies=["p0", "p1", "p2", "p3"],
policy_mapping_fn=(lambda agent_id, episode, worker, **kwargs: f"p{agent_id}"),
.api_stack(
enable_rl_module_and_learner=True,
enable_env_runner_and_connector_v2=True,
)
.environment("env", env_config={"num_agents": args.num_agents})
.training(
num_sgd_iter=1,
vf_loss_coeff=0.005,
vtrace=True,
model={
"fcnet_hiddens": [32],
"fcnet_activation": "linear",
train_batch_size_per_learner=750,
grad_clip=40.0,
grad_clip_by="global_norm",
lr=0.00075,
vf_loss_coeff=0.01,
)
.rl_module(
model_config_dict={
"vf_share_layers": True,
"uses_new_env_runners": True,
},
replay_proportion=0.0,
)
.multi_agent(
policy_mapping_fn=(lambda agent_id, episode, **kwargs: f"p{agent_id}"),
policies={f"p{i}" for i in range(args.num_agents)},
)
)

stop = {
f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": 600, # 600 / 4 (==num_agents) = 150
f"{NUM_ENV_STEPS_SAMPLED_LIFETIME}": 200000,
f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": 400.0 * args.num_agents,
NUM_ENV_STEPS_SAMPLED_LIFETIME: 2000000,
}


if __name__ == "__main__":
from ray.rllib.utils.test_utils import run_rllib_example_script_experiment

run_rllib_example_script_experiment(config, args, stop=stop)
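
A side note on the rewritten stop criterion: the multi-agent episode return reported under ENV_RUNNER_RESULTS is the sum over all agents (the old "600 / 4 (==num_agents) = 150" comment relied on the same fact), so the threshold scales with --num-agents. A tiny illustrative sketch for the parser default of two agents, mirroring the expressions in the script above:

num_agents = 2  # parser.set_defaults(num_agents=2, ...)

# One policy per agent, matching the policy_mapping_fn f"p{agent_id}".
policies = {f"p{i}" for i in range(num_agents)}  # {"p0", "p1"}

# 400.0 return per agent, summed across agents into the episode return.
stop_return = 400.0 * num_agents  # 800.0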

This file was deleted.

43 changes: 43 additions & 0 deletions rllib/tuned_examples/impala/multi_agent_cartpole_impala_old_api_stack.py
@@ -0,0 +1,43 @@
# @OldAPIStack
from ray.rllib.algorithms.impala import ImpalaConfig
from ray.rllib.examples.envs.classes.multi_agent import MultiAgentCartPole
from ray.rllib.utils.metrics import (
ENV_RUNNER_RESULTS,
EPISODE_RETURN_MEAN,
NUM_ENV_STEPS_SAMPLED_LIFETIME,
)
from ray import tune

tune.registry.register_env("env", lambda cfg: MultiAgentCartPole(config=cfg))


config = (
ImpalaConfig()
.environment("env", env_config={"num_agents": 4})
.env_runners(
num_envs_per_env_runner=5,
num_env_runners=4,
observation_filter="MeanStdFilter",
)
.resources(num_gpus=1, _fake_gpus=True)
.multi_agent(
policies=["p0", "p1", "p2", "p3"],
policy_mapping_fn=(lambda agent_id, episode, worker, **kwargs: f"p{agent_id}"),
)
.training(
num_sgd_iter=1,
vf_loss_coeff=0.005,
vtrace=True,
model={
"fcnet_hiddens": [32],
"fcnet_activation": "linear",
"vf_share_layers": True,
},
replay_proportion=0.0,
)
)

stop = {
f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": 600, # 600 / 4 (==num_agents) = 150
f"{NUM_ENV_STEPS_SAMPLED_LIFETIME}": 200000,
}
9 changes: 7 additions & 2 deletions rllib/tuned_examples/impala/pong_impala.py
@@ -4,6 +4,11 @@
from ray.rllib.core.rl_module.rl_module import SingleAgentRLModuleSpec
from ray.rllib.env.wrappers.atari_wrappers import wrap_atari_for_new_api_stack
from ray.rllib.examples.rl_modules.classes.tiny_atari_cnn import TinyAtariCNN
from ray.rllib.utils.metrics import (
ENV_RUNNER_RESULTS,
EPISODE_RETURN_MEAN,
NUM_ENV_STEPS_SAMPLED_LIFETIME,
)
from ray.rllib.utils.test_utils import add_rllib_example_script_args
from ray.tune.registry import register_env

@@ -82,8 +87,8 @@ def _env_creator(cfg):
)

stop = {
"env_runner_results/episode_return_mean": 20.0,
"num_env_steps_sampled_lifetime": 5000000,
f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": 20.0,
NUM_ENV_STEPS_SAMPLED_LIFETIME: 5000000,
}

