[RLlib] QMIX better defaults + added to CI learning tests (#21332)
sven1977 authored Jan 4, 2022
1 parent 8cc2680 commit abd3bef
Showing 10 changed files with 194 additions and 63 deletions.
47 changes: 34 additions & 13 deletions rllib/BUILD
@@ -11,10 +11,9 @@
# Currently we have the following categories:

# - Learning tests/regression, tagged:
# -- "learning_tests_[tf|tf2|torch]": Distinguish tf/tf2 vs torch.
# -- "learning_tests_[discrete|continuous]_[tf|tf2|torch]": distinguish discrete
# actions vs continuous actions AND tf vs torch.
# -- "fake_gpus_[tf|torch]": Tests that run using 2 fake GPUs.
# -- "learning_tests_[discrete|continuous]": distinguish discrete
# actions vs continuous actions.
# -- "fake_gpus": Tests that run using 2 fake GPUs.

# - Quick agent compilation/tune-train tests, tagged "quick_train".
# NOTE: These should be obsoleted in favor of "trainers_dir" tests as
@@ -413,6 +412,37 @@ py_test(
args = ["--yaml-dir=tuned_examples/ppo"]
)

# QMIX
py_test(
name = "learning_tests_two_step_game_qmix",
main = "tests/run_regression_tests.py",
tags = ["team:ml", "learning_tests", "learning_tests_discrete"],
size = "large",
srcs = ["tests/run_regression_tests.py"],
data = ["tuned_examples/qmix/two-step-game-qmix.yaml"],
args = ["--yaml-dir=tuned_examples/qmix", "--framework=torch"]
)

py_test(
name = "learning_tests_two_step_game_qmix_vdn_mixer",
main = "tests/run_regression_tests.py",
tags = ["team:ml", "learning_tests", "learning_tests_discrete"],
size = "large",
srcs = ["tests/run_regression_tests.py"],
data = ["tuned_examples/qmix/two-step-game-qmix-vdn-mixer.yaml"],
args = ["--yaml-dir=tuned_examples/qmix", "--framework=torch"]
)

py_test(
name = "learning_tests_two_step_game_qmix_no_mixer",
main = "tests/run_regression_tests.py",
tags = ["team:ml", "learning_tests", "learning_tests_discrete"],
size = "large",
srcs = ["tests/run_regression_tests.py"],
data = ["tuned_examples/qmix/two-step-game-qmix-no-mixer.yaml"],
args = ["--yaml-dir=tuned_examples/qmix", "--framework=torch"]
)
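
For reference, all three targets above drive the same regression-test entry point. Roughly, each amounts to the following local invocation (a sketch assuming paths relative to the rllib/ directory; the subprocess wrapper is illustrative and not part of the commit):

import subprocess

# Approximate local equivalent of the Bazel targets above: run the QMIX
# tuned examples through the shared regression-test runner on torch.
subprocess.run(
    [
        "python", "tests/run_regression_tests.py",
        "--yaml-dir=tuned_examples/qmix",
        "--framework=torch",
    ],
    check=True,
)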

# R2D2
py_test(
name = "learning_tests_stateless_cartpole_r2d2",
@@ -2683,15 +2713,6 @@ py_test(
args = ["--as-test", "--framework=torch", "--stop-reward=7", "--run=PG"]
)

-py_test(
-    name = "examples/two_step_game_qmix",
-    main = "examples/two_step_game.py",
-    tags = ["team:ml", "examples", "examples_T"],
-    size = "large",
-    srcs = ["examples/two_step_game.py"],
-    args = ["--as-test", "--framework=torch", "--stop-reward=7", "--run=QMIX"]
-)

py_test(
name = "contrib/bandits/examples/lin_ts",
main = "contrib/bandits/examples/simple_context_bandit.py",
5 changes: 3 additions & 2 deletions rllib/agents/qmix/qmix.py
@@ -34,8 +34,9 @@
"type": "EpsilonGreedy",
# Config for the Exploration class' constructor:
"initial_epsilon": 1.0,
"final_epsilon": 0.02,
"epsilon_timesteps": 10000, # Timesteps over which to anneal epsilon.
"final_epsilon": 0.01,
# Timesteps over which to anneal epsilon.
"epsilon_timesteps": 40000,

# For soft_q, use:
# "exploration_config" = {
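
The new defaults anneal epsilon from 1.0 down to 0.01 over 40k timesteps (previously down to 0.02 over 10k). A minimal sketch of overriding this schedule for a single experiment, using the standard trainer-config pattern (the env name is a placeholder; the exploration keys are the ones shown above):

from ray.rllib.agents.qmix import QMixTrainer

# Override the default epsilon-greedy schedule for one experiment.
trainer = QMixTrainer(
    env="my_grouped_env",  # placeholder: any registered, grouped multi-agent env
    config={
        "framework": "torch",
        "exploration_config": {
            "type": "EpsilonGreedy",
            "initial_epsilon": 1.0,
            "final_epsilon": 0.01,
            "epsilon_timesteps": 40000,
        },
    })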
43 changes: 29 additions & 14 deletions rllib/agents/qmix/tests/test_qmix.py
@@ -24,32 +24,42 @@ class AvailActionsTestEnv(MultiAgentEnv):

def __init__(self, env_config):
self.state = None
-self.avail = env_config["avail_action"]
+self.avail = env_config.get("avail_actions", [3])
self.action_mask = np.array([0] * 10)
-self.action_mask[env_config["avail_action"]] = 1
+for a in self.avail:
+    self.action_mask[a] = 1

def reset(self):
self.state = 0
return {
"agent_1": {
"obs": self.observation_space["obs"].sample(),
"action_mask": self.action_mask
-}
+},
+"agent_2": {
+    "obs": self.observation_space["obs"].sample(),
+    "action_mask": self.action_mask
+},
}

def step(self, action_dict):
if self.state > 0:
-assert action_dict["agent_1"] == self.avail, \
+assert (action_dict["agent_1"] in self.avail and
+        action_dict["agent_2"] in self.avail), \
    "Failed to obey available actions mask!"
self.state += 1
-rewards = {"agent_1": 1}
+rewards = {"agent_1": 1, "agent_2": 0.5}
obs = {
"agent_1": {
"obs": self.observation_space["obs"].sample(),
"action_mask": self.action_mask
},
"agent_2": {
"obs": self.observation_space["obs"].sample(),
"action_mask": self.action_mask
}
}
dones = {"__all__": self.state > 20}
dones = {"__all__": self.state >= 20}
return obs, rewards, dones, {}


@@ -64,28 +74,33 @@ def tearDownClass(cls) -> None:

def test_avail_actions_qmix(self):
grouping = {
"group_1": ["agent_1"], # trivial grouping for testing
"group_1": ["agent_1", "agent_2"],
}
-obs_space = Tuple([AvailActionsTestEnv.observation_space])
-act_space = Tuple([AvailActionsTestEnv.action_space])
+obs_space = Tuple([
+    AvailActionsTestEnv.observation_space,
+    AvailActionsTestEnv.observation_space
+])
+act_space = Tuple([
+    AvailActionsTestEnv.action_space, AvailActionsTestEnv.action_space
+])
register_env(
"action_mask_test",
lambda config: AvailActionsTestEnv(config).with_agent_groups(
grouping, obs_space=obs_space, act_space=act_space))

-agent = QMixTrainer(
+trainer = QMixTrainer(
env="action_mask_test",
config={
"num_envs_per_worker": 5, # test with vectorization on
"env_config": {
"avail_action": 3,
"avail_actions": [3, 4, 8],
},
"framework": "torch",
})
for _ in range(4):
-agent.train()  # OK if it doesn't trip the action assertion error
-assert agent.train()["episode_reward_mean"] == 21.0
-agent.stop()
+trainer.train()  # OK if it doesn't trip the action assertion error
+assert trainer.train()["episode_reward_mean"] == 30.0
+trainer.stop()
ray.shutdown()


22 changes: 21 additions & 1 deletion rllib/examples/env/two_step_game.py
@@ -1,4 +1,4 @@
-from gym.spaces import MultiDiscrete, Dict, Discrete
+from gym.spaces import Dict, Discrete, MultiDiscrete, Tuple
import numpy as np

from ray.rllib.env.multi_agent_env import MultiAgentEnv, ENV_STATE
@@ -109,3 +109,23 @@ def agent_2_obs(self):
return np.concatenate([self.state, [2]])
else:
return np.flatnonzero(self.state)[0] + 3


class TwoStepGameWithGroupedAgents(MultiAgentEnv):
def __init__(self, env_config):
env = TwoStepGame(env_config)
tuple_obs_space = Tuple([env.observation_space, env.observation_space])
tuple_act_space = Tuple([env.action_space, env.action_space])

self.env = env.with_agent_groups(
groups={"agents": [0, 1]},
obs_space=tuple_obs_space,
act_space=tuple_act_space)
self.observation_space = self.env.observation_space
self.action_space = self.env.action_space

def reset(self):
return self.env.reset()

def step(self, actions):
return self.env.step(actions)
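
A short sketch of how the new pre-grouped wrapper can be plugged into a run (the env registration and config keys mirror rllib/examples/two_step_game.py; the stop criterion is illustrative):

from ray import tune
from ray.tune import register_env
from ray.rllib.examples.env.two_step_game import TwoStepGameWithGroupedAgents

# Register the already-grouped env so QMIX can be pointed at it by name.
register_env("grouped_twostep",
             lambda config: TwoStepGameWithGroupedAgents(config))

tune.run(
    "QMIX",
    stop={"episode_reward_mean": 8.0},  # illustrative stop criterion
    config={
        "framework": "torch",
        "env": "grouped_twostep",
        "mixer": "qmix",
        "env_config": {
            "separate_state_space": True,
            "one_hot_state_encoding": True,
        },
    })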
20 changes: 11 additions & 9 deletions rllib/examples/two_step_game.py
@@ -14,7 +14,7 @@

import ray
from ray import tune
-from ray.tune import register_env, grid_search
+from ray.tune import register_env
from ray.rllib.env.multi_agent_env import ENV_STATE
from ray.rllib.examples.env.two_step_game import TwoStepGame
from ray.rllib.policy.policy import PolicySpec
@@ -32,6 +32,12 @@
default="tf",
help="The DL framework specifier.")
parser.add_argument("--num-cpus", type=int, default=0)
parser.add_argument(
"--mixer",
type=str,
default="qmix",
choices=["qmix", "vdn", "none"],
help="The mixer model to use.")
parser.add_argument(
"--as-test",
action="store_true",
@@ -45,12 +51,12 @@
parser.add_argument(
"--stop-timesteps",
type=int,
-default=50000,
+default=70000,
help="Number of timesteps to train.")
parser.add_argument(
"--stop-reward",
type=float,
-default=7.0,
+default=8.0,
help="Reward at which we stop training.")
parser.add_argument(
"--local-mode",
@@ -116,11 +122,10 @@
"rollout_fragment_length": 4,
"train_batch_size": 32,
"exploration_config": {
"epsilon_timesteps": 5000,
"final_epsilon": 0.05,
"final_epsilon": 0.0,
},
"num_workers": 0,
"mixer": grid_search([None, "qmix"]),
"mixer": args.mixer,
"env_config": {
"separate_state_space": True,
"one_hot_state_encoding": True
@@ -147,9 +152,6 @@
"env": "grouped_twostep" if group else TwoStepGame,
})

-if args.as_test:
-    config["seed"] = 1234

results = tune.run(args.run, stop=stop, config=config, verbose=2)

if args.as_test:
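
With the grid_search over mixers gone, a single mixer is now chosen per run via the new --mixer flag. A self-contained stand-in for that part of the parser, just to show the flag's shape (the real script defines more options; the parse_args argument list is hypothetical):

import argparse

# Minimal stand-in for the example script's parser, showing only the new flag.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--mixer",
    type=str,
    default="qmix",
    choices=["qmix", "vdn", "none"],
    help="The mixer model to use.")

args = parser.parse_args(["--mixer=vdn"])
print(args.mixer)  # -> "vdn"; fed straight into the QMIX config as "mixer"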
15 changes: 10 additions & 5 deletions rllib/tests/run_regression_tests.py
@@ -53,6 +53,10 @@
if __name__ == "__main__":
args = parser.parse_args()

+# Error if deprecated --torch option used.
+if args.torch:
+    deprecation_warning(old="--torch", new="--framework=torch", error=True)

# Bazel regression test mode: Get path to look for yaml files.
# Get the path or single file to use.
rllib_dir = Path(__file__).parent.parent
@@ -81,13 +85,14 @@
assert len(experiments) == 1,\
"Error, can only run a single experiment per yaml file!"

-# Add torch option to exp config.
exp = list(experiments.values())[0]
+exp["config"]["framework"] = args.framework
-if args.torch:
-    deprecation_warning(old="--torch", new="--framework=torch")
-    exp["config"]["framework"] = "torch"
-    args.framework = "torch"

+# QMIX does not support tf yet -> skip.
+if exp["run"] == "QMIX" and args.framework != "torch":
+    print(f"Skipping framework='{args.framework}' for QMIX.")
+    continue

# Always run with eager-tracing when framework=tf2.
if args.framework in ["tf2", "tfe"]:
exp["config"]["eager_tracing"] = True
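
For context, each tuned-example YAML parses into an experiments dict with a single entry, which is what the loop above unpacks. An illustrative, hypothetical shape showing only the keys the snippet actually reads ("run" and "config") plus standard Tune fields; the name and values are not copied from the real qmix yaml files:

# Hypothetical parsed form of one tuned-example YAML file.
experiments = {
    "two-step-game-qmix": {
        "run": "QMIX",
        "stop": {"episode_reward_mean": 7.0},  # illustrative
        "config": {
            "framework": "torch",
            "mixer": "qmix",
        },
    },
}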