[RLlib] QMIX better defaults + added to CI learning tests (#21332)
sven1977 authored Jan 4, 2022
1 parent 8cc2680 commit abd3bef
Showing 10 changed files with 194 additions and 63 deletions.
47 changes: 34 additions & 13 deletions rllib/BUILD
@@ -11,10 +11,9 @@
# Currently we have the following categories:

# - Learning tests/regression, tagged:
# -- "learning_tests_[tf|tf2|torch]": Distinguish tf/tf2 vs torch.
# -- "learning_tests_[discrete|continuous]_[tf|tf2|torch]": distinguish discrete
# actions vs continuous actions AND tf vs torch.
# -- "fake_gpus_[tf|torch]": Tests that run using 2 fake GPUs.
# -- "learning_tests_[discrete|continuous]": distinguish discrete
# actions vs continuous actions.
# -- "fake_gpus": Tests that run using 2 fake GPUs.

# - Quick agent compilation/tune-train tests, tagged "quick_train".
# NOTE: These should be obsoleted in favor of "trainers_dir" tests as
@@ -413,6 +412,37 @@ py_test(
args = ["--yaml-dir=tuned_examples/ppo"]
)

# QMIX
py_test(
name = "learning_tests_two_step_game_qmix",
main = "tests/run_regression_tests.py",
tags = ["team:ml", "learning_tests", "learning_tests_discrete"],
size = "large",
srcs = ["tests/run_regression_tests.py"],
data = ["tuned_examples/qmix/two-step-game-qmix.yaml"],
args = ["--yaml-dir=tuned_examples/qmix", "--framework=torch"]
)

py_test(
name = "learning_tests_two_step_game_qmix_vdn_mixer",
main = "tests/run_regression_tests.py",
tags = ["team:ml", "learning_tests", "learning_tests_discrete"],
size = "large",
srcs = ["tests/run_regression_tests.py"],
data = ["tuned_examples/qmix/two-step-game-qmix-vdn-mixer.yaml"],
args = ["--yaml-dir=tuned_examples/qmix", "--framework=torch"]
)

py_test(
name = "learning_tests_two_step_game_qmix_no_mixer",
main = "tests/run_regression_tests.py",
tags = ["team:ml", "learning_tests", "learning_tests_discrete"],
size = "large",
srcs = ["tests/run_regression_tests.py"],
data = ["tuned_examples/qmix/two-step-game-qmix-no-mixer.yaml"],
args = ["--yaml-dir=tuned_examples/qmix", "--framework=torch"]
)
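
For reference, all three targets above drive the same regression-test entry point. Roughly, each amounts to the following local invocation (a sketch assuming paths relative to the rllib/ directory; the subprocess wrapper is illustrative and not part of the commit):

import subprocess

# Approximate local equivalent of the Bazel targets above: run the QMIX
# tuned examples through the shared regression-test runner on torch.
subprocess.run(
    [
        "python", "tests/run_regression_tests.py",
        "--yaml-dir=tuned_examples/qmix",
        "--framework=torch",
    ],
    check=True,
)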

# R2D2
py_test(
name = "learning_tests_stateless_cartpole_r2d2",
@@ -2683,15 +2713,6 @@ py_test(
args = ["--as-test", "--framework=torch", "--stop-reward=7", "--run=PG"]
)

-py_test(
-    name = "examples/two_step_game_qmix",
-    main = "examples/two_step_game.py",
-    tags = ["team:ml", "examples", "examples_T"],
-    size = "large",
-    srcs = ["examples/two_step_game.py"],
-    args = ["--as-test", "--framework=torch", "--stop-reward=7", "--run=QMIX"]
-)

py_test(
name = "contrib/bandits/examples/lin_ts",
main = "contrib/bandits/examples/simple_context_bandit.py",
5 changes: 3 additions & 2 deletions rllib/agents/qmix/qmix.py
@@ -34,8 +34,9 @@
"type": "EpsilonGreedy",
# Config for the Exploration class' constructor:
"initial_epsilon": 1.0,
"final_epsilon": 0.02,
"epsilon_timesteps": 10000, # Timesteps over which to anneal epsilon.
"final_epsilon": 0.01,
# Timesteps over which to anneal epsilon.
"epsilon_timesteps": 40000,

# For soft_q, use:
# "exploration_config" = {
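
The new defaults anneal epsilon from 1.0 down to 0.01 over 40k timesteps (previously down to 0.02 over 10k). A minimal sketch of overriding this schedule for a single experiment, using the standard trainer-config pattern (the env name is a placeholder; the exploration keys are the ones shown above):

from ray.rllib.agents.qmix import QMixTrainer

# Override the default epsilon-greedy schedule for one experiment.
trainer = QMixTrainer(
    env="my_grouped_env",  # placeholder: any registered, grouped multi-agent env
    config={
        "framework": "torch",
        "exploration_config": {
            "type": "EpsilonGreedy",
            "initial_epsilon": 1.0,
            "final_epsilon": 0.01,
            "epsilon_timesteps": 40000,
        },
    })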
43 changes: 29 additions & 14 deletions rllib/agents/qmix/tests/test_qmix.py
@@ -24,32 +24,42 @@ class AvailActionsTestEnv(MultiAgentEnv):

def __init__(self, env_config):
self.state = None
-self.avail = env_config["avail_action"]
+self.avail = env_config.get("avail_actions", [3])
self.action_mask = np.array([0] * 10)
-self.action_mask[env_config["avail_action"]] = 1
+for a in self.avail:
+    self.action_mask[a] = 1

def reset(self):
self.state = 0
return {
"agent_1": {
"obs": self.observation_space["obs"].sample(),
"action_mask": self.action_mask
-}
+},
+"agent_2": {
+    "obs": self.observation_space["obs"].sample(),
+    "action_mask": self.action_mask
+},
}

def step(self, action_dict):
if self.state > 0:
-assert action_dict["agent_1"] == self.avail, \
+assert (action_dict["agent_1"] in self.avail and
+        action_dict["agent_2"] in self.avail), \
    "Failed to obey available actions mask!"
self.state += 1
-rewards = {"agent_1": 1}
+rewards = {"agent_1": 1, "agent_2": 0.5}
obs = {
"agent_1": {
"obs": self.observation_space["obs"].sample(),
"action_mask": self.action_mask
},
"agent_2": {
"obs": self.observation_space["obs"].sample(),
"action_mask": self.action_mask
}
}
dones = {"__all__": self.state > 20}
dones = {"__all__": self.state >= 20}
return obs, rewards, dones, {}


@@ -64,28 +74,33 @@ def tearDownClass(cls) -> None:

def test_avail_actions_qmix(self):
grouping = {
"group_1": ["agent_1"], # trivial grouping for testing
"group_1": ["agent_1", "agent_2"],
}
-obs_space = Tuple([AvailActionsTestEnv.observation_space])
-act_space = Tuple([AvailActionsTestEnv.action_space])
+obs_space = Tuple([
+    AvailActionsTestEnv.observation_space,
+    AvailActionsTestEnv.observation_space
+])
+act_space = Tuple([
+    AvailActionsTestEnv.action_space, AvailActionsTestEnv.action_space
+])
register_env(
"action_mask_test",
lambda config: AvailActionsTestEnv(config).with_agent_groups(
grouping, obs_space=obs_space, act_space=act_space))

-agent = QMixTrainer(
+trainer = QMixTrainer(
env="action_mask_test",
config={
"num_envs_per_worker": 5, # test with vectorization on
"env_config": {
"avail_action": 3,
"avail_actions": [3, 4, 8],
},
"framework": "torch",
})
for _ in range(4):
-agent.train()  # OK if it doesn't trip the action assertion error
-assert agent.train()["episode_reward_mean"] == 21.0
-agent.stop()
+trainer.train()  # OK if it doesn't trip the action assertion error
+assert trainer.train()["episode_reward_mean"] == 30.0
+trainer.stop()
ray.shutdown()


22 changes: 21 additions & 1 deletion rllib/examples/env/two_step_game.py
@@ -1,4 +1,4 @@
-from gym.spaces import MultiDiscrete, Dict, Discrete
+from gym.spaces import Dict, Discrete, MultiDiscrete, Tuple
import numpy as np

from ray.rllib.env.multi_agent_env import MultiAgentEnv, ENV_STATE
@@ -109,3 +109,23 @@ def agent_2_obs(self):
return np.concatenate([self.state, [2]])
else:
return np.flatnonzero(self.state)[0] + 3


class TwoStepGameWithGroupedAgents(MultiAgentEnv):
def __init__(self, env_config):
env = TwoStepGame(env_config)
tuple_obs_space = Tuple([env.observation_space, env.observation_space])
tuple_act_space = Tuple([env.action_space, env.action_space])

self.env = env.with_agent_groups(
groups={"agents": [0, 1]},
obs_space=tuple_obs_space,
act_space=tuple_act_space)
self.observation_space = self.env.observation_space
self.action_space = self.env.action_space

def reset(self):
return self.env.reset()

def step(self, actions):
return self.env.step(actions)
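
A short sketch of how the new pre-grouped wrapper can be plugged into a run (the env registration and config keys mirror rllib/examples/two_step_game.py; the stop criterion is illustrative):

from ray import tune
from ray.tune import register_env
from ray.rllib.examples.env.two_step_game import TwoStepGameWithGroupedAgents

# Register the already-grouped env so QMIX can be pointed at it by name.
register_env("grouped_twostep",
             lambda config: TwoStepGameWithGroupedAgents(config))

tune.run(
    "QMIX",
    stop={"episode_reward_mean": 8.0},  # illustrative stop criterion
    config={
        "framework": "torch",
        "env": "grouped_twostep",
        "mixer": "qmix",
        "env_config": {
            "separate_state_space": True,
            "one_hot_state_encoding": True,
        },
    })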
20 changes: 11 additions & 9 deletions rllib/examples/two_step_game.py
@@ -14,7 +14,7 @@

import ray
from ray import tune
-from ray.tune import register_env, grid_search
+from ray.tune import register_env
from ray.rllib.env.multi_agent_env import ENV_STATE
from ray.rllib.examples.env.two_step_game import TwoStepGame
from ray.rllib.policy.policy import PolicySpec
@@ -32,6 +32,12 @@
default="tf",
help="The DL framework specifier.")
parser.add_argument("--num-cpus", type=int, default=0)
parser.add_argument(
"--mixer",
type=str,
default="qmix",
choices=["qmix", "vdn", "none"],
help="The mixer model to use.")
parser.add_argument(
"--as-test",
action="store_true",
@@ -45,12 +51,12 @@
parser.add_argument(
"--stop-timesteps",
type=int,
-default=50000,
+default=70000,
help="Number of timesteps to train.")
parser.add_argument(
"--stop-reward",
type=float,
-default=7.0,
+default=8.0,
help="Reward at which we stop training.")
parser.add_argument(
"--local-mode",
@@ -116,11 +122,10 @@
"rollout_fragment_length": 4,
"train_batch_size": 32,
"exploration_config": {
"epsilon_timesteps": 5000,
"final_epsilon": 0.05,
"final_epsilon": 0.0,
},
"num_workers": 0,
"mixer": grid_search([None, "qmix"]),
"mixer": args.mixer,
"env_config": {
"separate_state_space": True,
"one_hot_state_encoding": True
@@ -147,9 +152,6 @@
"env": "grouped_twostep" if group else TwoStepGame,
})

-if args.as_test:
-    config["seed"] = 1234

results = tune.run(args.run, stop=stop, config=config, verbose=2)

if args.as_test:
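
With the grid_search over mixers gone, a single mixer is now chosen per run via the new --mixer flag. A self-contained stand-in for that part of the parser, just to show the flag's shape (the real script defines more options; the parse_args argument list is hypothetical):

import argparse

# Minimal stand-in for the example script's parser, showing only the new flag.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--mixer",
    type=str,
    default="qmix",
    choices=["qmix", "vdn", "none"],
    help="The mixer model to use.")

args = parser.parse_args(["--mixer=vdn"])
print(args.mixer)  # -> "vdn"; fed straight into the QMIX config as "mixer"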
15 changes: 10 additions & 5 deletions rllib/tests/run_regression_tests.py
@@ -53,6 +53,10 @@
if __name__ == "__main__":
args = parser.parse_args()

+# Error if deprecated --torch option used.
+if args.torch:
+    deprecation_warning(old="--torch", new="--framework=torch", error=True)

# Bazel regression test mode: Get path to look for yaml files.
# Get the path or single file to use.
rllib_dir = Path(__file__).parent.parent
@@ -81,13 +85,14 @@
assert len(experiments) == 1,\
"Error, can only run a single experiment per yaml file!"

-# Add torch option to exp config.
exp = list(experiments.values())[0]
+exp["config"]["framework"] = args.framework
-if args.torch:
-    deprecation_warning(old="--torch", new="--framework=torch")
-    exp["config"]["framework"] = "torch"
-    args.framework = "torch"

+# QMIX does not support tf yet -> skip.
+if exp["run"] == "QMIX" and args.framework != "torch":
+    print(f"Skipping framework='{args.framework}' for QMIX.")
+    continue

# Always run with eager-tracing when framework=tf2.
if args.framework in ["tf2", "tfe"]:
exp["config"]["eager_tracing"] = True
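
For context, each tuned-example YAML parses into an experiments dict with a single entry, which is what the loop above unpacks. An illustrative, hypothetical shape showing only the keys the snippet actually reads ("run" and "config") plus standard Tune fields; the name and values are not copied from the real qmix yaml files:

# Hypothetical parsed form of one tuned-example YAML file.
experiments = {
    "two-step-game-qmix": {
        "run": "QMIX",
        "stop": {"episode_reward_mean": 7.0},  # illustrative
        "config": {
            "framework": "torch",
            "mixer": "qmix",
        },
    },
}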