[RLlib] New API stack: Add systematic IMPALA learning tests for [CartPole|Pendulum] | [CPU|GPU|multi-CPU|multi-GPU] | [single- and multi-agent]. #46162

Merged
11 commits merged on Jun 22, 2024
60 changes: 57 additions & 3 deletions rllib/BUILD
@@ -333,6 +333,62 @@ py_test(
srcs = ["tuned_examples/impala/cartpole_impala.py"],
args = ["--as-test", "--enable-new-api-stack"]
)
py_test(
name = "learning_tests_cartpole_impala_gpu",
main = "tuned_examples/impala/cartpole_impala.py",
tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_cartpole", "learning_tests_discrete", "learning_tests_pytorch_use_all_core", "gpu"],
size = "large",
srcs = ["tuned_examples/impala/cartpole_impala.py"],
args = ["--as-test", "--enable-new-api-stack", "--num-gpus=1"]
)
py_test(
name = "learning_tests_cartpole_impala_multi_cpu",
main = "tuned_examples/impala/cartpole_impala.py",
tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_cartpole", "learning_tests_discrete", "learning_tests_pytorch_use_all_core"],
size = "large",
srcs = ["tuned_examples/impala/cartpole_impala.py"],
args = ["--as-test", "--enable-new-api-stack", "--num-gpus=2"]
)
py_test(
name = "learning_tests_cartpole_impala_multi_gpu",
main = "tuned_examples/impala/cartpole_impala.py",
tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_cartpole", "learning_tests_discrete", "learning_tests_pytorch_use_all_core", "multi_gpu"],
size = "large",
srcs = ["tuned_examples/impala/cartpole_impala.py"],
args = ["--as-test", "--enable-new-api-stack", "--num-gpus=2"]
)
py_test(
name = "learning_tests_multi_agent_cartpole_impala",
main = "tuned_examples/impala/multi_agent_cartpole_impala.py",
tags = ["team:rllib", "exclusive", "learning_tests", "learning_tests_cartpole", "learning_tests_discrete", "torch_only"],
size = "large",
srcs = ["tuned_examples/impala/multi_agent_cartpole_impala.py"],
args = ["--as-test", "--enable-new-api-stack", "--num-agents=2", "--num-cpus=5"]
)
py_test(
name = "learning_tests_multi_agent_cartpole_impala_gpu",
main = "tuned_examples/impala/multi_agent_cartpole_impala.py",
tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_cartpole", "learning_tests_discrete", "learning_tests_pytorch_use_all_core", "gpu"],
size = "large",
srcs = ["tuned_examples/impala/multi_agent_cartpole_impala.py"],
args = ["--as-test", "--enable-new-api-stack", "--num-agents=2", "--num-gpus=1", "--num-cpus=6"]
)
py_test(
name = "learning_tests_multi_agent_cartpole_impala_multi_cpu",
main = "tuned_examples/impala/multi_agent_cartpole_impala.py",
tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_cartpole", "learning_tests_discrete", "learning_tests_pytorch_use_all_core"],
size = "large",
srcs = ["tuned_examples/impala/multi_agent_cartpole_impala.py"],
args = ["--as-test", "--enable-new-api-stack", "--num-agents=2", "--num-gpus=2", "--num-cpus=7"]
)
py_test(
name = "learning_tests_multi_agent_cartpole_impala_multi_gpu",
main = "tuned_examples/impala/multi_agent_cartpole_impala.py",
tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_cartpole", "learning_tests_discrete", "learning_tests_pytorch_use_all_core", "multi_gpu"],
size = "large",
srcs = ["tuned_examples/impala/multi_agent_cartpole_impala.py"],
args = ["--as-test", "--enable-new-api-stack", "--num-agents=2", "--num-gpus=2", "--num-cpus=7"]
)

#@OldAPIStack
py_test(
@@ -346,18 +402,16 @@ py_test(
],
args = ["--dir=tuned_examples/impala"]
)

#@OldAPIStack
py_test(
name = "learning_tests_multi_agent_cartpole_impala_old_api_stack",
main = "tests/run_regression_tests.py",
tags = ["team:rllib", "exclusive", "learning_tests", "learning_tests_cartpole", "learning_tests_discrete"],
size = "medium",
srcs = ["tests/run_regression_tests.py"],
data = ["tuned_examples/impala/multi_agent_cartpole_impala.py"],
data = ["tuned_examples/impala/multi_agent_cartpole_impala_old_api_stack.py"],
args = ["--dir=tuned_examples/impala"]
)

#@OldAPIStack
py_test(
name = "learning_tests_cartpole_impala_fake_gpus_old_api_stack",
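The new targets above all point at the same tuned-example script and differ only in their CLI flags (--num-gpus, --num-cpus, --num-agents). As rough orientation, the sketch below shows how such a script is typically wired together with the helpers that appear later in this diff; it is a minimal sketch only, assuming the shared parser exposes the flags used in the BUILD targets, and the actual cartpole_impala.py may differ.

from ray.rllib.algorithms.impala import ImpalaConfig
from ray.rllib.utils.test_utils import (
    add_rllib_example_script_args,
    run_rllib_example_script_experiment,
)

# Shared example-script parser; the BUILD targets only vary flags such as
# --enable-new-api-stack, --as-test, --num-gpus, --num-cpus, --num-agents.
parser = add_rllib_example_script_args()
args = parser.parse_args()

# Minimal IMPALA config; the runner helper applies the parsed resource and
# scaling flags before launching the experiment.
config = ImpalaConfig().environment("CartPole-v1")

# Hypothetical stop criterion for this sketch only.
stop = {"training_iteration": 200}

if __name__ == "__main__":
    run_rllib_example_script_experiment(config, args, stop=stop)
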
45 changes: 18 additions & 27 deletions rllib/algorithms/algorithm.py
@@ -39,7 +39,7 @@
from ray.rllib.algorithms.algorithm_config import AlgorithmConfig
from ray.rllib.algorithms.registry import ALGORITHMS_CLASS_TO_NAME as ALL_ALGORITHMS
from ray.rllib.connectors.agent.obs_preproc import ObsPreprocessorConnector
from ray.rllib.core import DEFAULT_AGENT_ID, DEFAULT_MODULE_ID
from ray.rllib.core import DEFAULT_MODULE_ID
from ray.rllib.core.columns import Columns
from ray.rllib.core.rl_module.marl_module import MultiAgentRLModuleSpec
from ray.rllib.core.rl_module.rl_module import RLModule, SingleAgentRLModuleSpec
@@ -93,6 +93,7 @@
ALL_MODULES,
ENV_RUNNER_RESULTS,
ENV_RUNNER_SAMPLING_TIMER,
EPISODE_LEN_MEAN,
EPISODE_RETURN_MAX,
EPISODE_RETURN_MEAN,
EPISODE_RETURN_MIN,
@@ -273,12 +274,12 @@ class Algorithm(Trainable, AlgorithmBase):
_override_all_key_list = ["off_policy_estimation_methods", "policies"]

_progress_metrics = (
f"{ENV_RUNNER_RESULTS}/episode_return_mean",
f"{EVALUATION_RESULTS}/{ENV_RUNNER_RESULTS}/episode_return_mean",
f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}",
f"{EVALUATION_RESULTS}/{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}",
f"{NUM_ENV_STEPS_SAMPLED_LIFETIME}",
f"{NUM_ENV_STEPS_TRAINED_LIFETIME}",
f"{NUM_EPISODES_LIFETIME}",
f"{ENV_RUNNER_RESULTS}/episode_len_mean",
f"{ENV_RUNNER_RESULTS}/{EPISODE_LEN_MEAN}",
)

@staticmethod
@@ -480,20 +481,6 @@ def __init__(
# components (including timers, counters and other stats in its own
# `training_step()` and other methods) as well as custom callbacks.
self.metrics = MetricsLogger()
# Initialize lifetime counters (or those that are common as Tune stop criteria.
# We don't want tune to crash regularly b/c these stats might be still missing
# entirely after the first few iterations.
self.metrics.log_dict(
{
NUM_ENV_STEPS_SAMPLED_LIFETIME: 0,
NUM_AGENT_STEPS_SAMPLED_LIFETIME: {DEFAULT_AGENT_ID: 0},
NUM_ENV_STEPS_TRAINED_LIFETIME: 0,
NUM_AGENT_STEPS_TRAINED_LIFETIME: {DEFAULT_AGENT_ID: 0},
NUM_EPISODES_LIFETIME: 0,
f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": np.nan,
},
reduce="sum",
)

# Create a default logger creator if no logger_creator is specified
if logger_creator is None:
@@ -914,7 +901,7 @@ def step(self) -> ResultDict:
self.workers.sync_env_runner_states(
config=self.config,
env_steps_sampled=self.metrics.peek(
NUM_ENV_STEPS_SAMPLED_LIFETIME
NUM_ENV_STEPS_SAMPLED_LIFETIME, default=0
),
)
# Compile final ResultDict from `train_results` and `eval_results`. Note
@@ -3632,16 +3619,20 @@ def __enter__(self):
self.trained = 0
if self.algo.config.enable_env_runner_and_connector_v2:
self.init_env_steps_sampled = self.algo.metrics.peek(
NUM_ENV_STEPS_SAMPLED_LIFETIME
NUM_ENV_STEPS_SAMPLED_LIFETIME, default=0
)
self.init_env_steps_trained = self.algo.metrics.peek(
NUM_ENV_STEPS_TRAINED_LIFETIME
NUM_ENV_STEPS_TRAINED_LIFETIME, default=0
)
self.init_agent_steps_sampled = sum(
self.algo.metrics.peek(NUM_AGENT_STEPS_SAMPLED_LIFETIME).values()
self.algo.metrics.peek(
NUM_AGENT_STEPS_SAMPLED_LIFETIME, default={}
).values()
)
self.init_agent_steps_trained = sum(
self.algo.metrics.peek(NUM_AGENT_STEPS_TRAINED_LIFETIME).values()
self.algo.metrics.peek(
NUM_AGENT_STEPS_TRAINED_LIFETIME, default={}
).values()
)
else:
self.init_env_steps_sampled = self.algo._counters[NUM_ENV_STEPS_SAMPLED]
@@ -3681,26 +3672,26 @@ def should_stop(self, results):
self.sampled = (
sum(
self.algo.metrics.peek(
NUM_AGENT_STEPS_SAMPLED_LIFETIME
NUM_AGENT_STEPS_SAMPLED_LIFETIME, default={}
).values()
)
- self.init_agent_steps_sampled
)
self.trained = (
sum(
self.algo.metrics.peek(
NUM_AGENT_STEPS_TRAINED_LIFETIME
NUM_AGENT_STEPS_TRAINED_LIFETIME, default={}
).values()
)
- self.init_agent_steps_trained
)
else:
self.sampled = (
self.algo.metrics.peek(NUM_ENV_STEPS_SAMPLED_LIFETIME)
self.algo.metrics.peek(NUM_ENV_STEPS_SAMPLED_LIFETIME, default=0)
- self.init_env_steps_sampled
)
self.trained = (
self.algo.metrics.peek(NUM_ENV_STEPS_TRAINED_LIFETIME)
self.algo.metrics.peek(NUM_ENV_STEPS_TRAINED_LIFETIME, default=0)
- self.init_env_steps_trained
)
else:
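The block removed from __init__ above pre-logged every lifetime counter as 0 (plus a NaN episode-return key) so that Tune stop criteria would not crash while these stats were still missing in the first iterations. The replacement leaves the logger empty and instead passes a default to MetricsLogger.peek() at every read site, as the changes above in step(), __enter__(), and should_stop() show. A minimal sketch of the assumed behavior follows; peek(..., default=...) and reduce="sum" are taken from this diff, while the MetricsLogger import path and the log_value() call are assumptions.

from ray.rllib.utils.metrics import NUM_ENV_STEPS_SAMPLED_LIFETIME
from ray.rllib.utils.metrics.metrics_logger import MetricsLogger

metrics = MetricsLogger()

# Nothing has been logged yet: peek() with a default falls back to 0 instead
# of failing on the missing key.
assert metrics.peek(NUM_ENV_STEPS_SAMPLED_LIFETIME, default=0) == 0

# Once values are logged with sum-reduction, peek() reflects the running total.
metrics.log_value(NUM_ENV_STEPS_SAMPLED_LIFETIME, 1000, reduce="sum")
metrics.log_value(NUM_ENV_STEPS_SAMPLED_LIFETIME, 500, reduce="sum")
print(metrics.peek(NUM_ENV_STEPS_SAMPLED_LIFETIME, default=0))  # expected: 1500
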
56 changes: 34 additions & 22 deletions rllib/tuned_examples/impala/multi_agent_cartpole_impala.py
@@ -1,43 +1,55 @@
# @OldAPIStack
from ray.rllib.algorithms.impala import ImpalaConfig
from ray.rllib.examples.envs.classes.multi_agent import MultiAgentCartPole
from ray.rllib.utils.metrics import (
ENV_RUNNER_RESULTS,
EPISODE_RETURN_MEAN,
NUM_ENV_STEPS_SAMPLED_LIFETIME,
)
from ray import tune
from ray.rllib.utils.test_utils import add_rllib_example_script_args
from ray.tune.registry import register_env

tune.registry.register_env("env", lambda cfg: MultiAgentCartPole(config=cfg))
parser = add_rllib_example_script_args()
parser.set_defaults(num_agents=2, num_env_runners=4)
# Use `parser` to add your own custom command line options to this script
# and (if needed) use their values to set up `config` below.
args = parser.parse_args()

register_env("env", lambda cfg: MultiAgentCartPole(config=cfg))


config = (
ImpalaConfig()
.environment("env", env_config={"num_agents": 4})
.env_runners(
num_envs_per_env_runner=5,
num_env_runners=4,
observation_filter="MeanStdFilter",
)
.resources(num_gpus=1, _fake_gpus=True)
.multi_agent(
policies=["p0", "p1", "p2", "p3"],
policy_mapping_fn=(lambda agent_id, episode, worker, **kwargs: f"p{agent_id}"),
.api_stack(
enable_rl_module_and_learner=True,
enable_env_runner_and_connector_v2=True,
)
.environment("env", env_config={"num_agents": args.num_agents})
.training(
num_sgd_iter=1,
vf_loss_coeff=0.005,
vtrace=True,
model={
"fcnet_hiddens": [32],
"fcnet_activation": "linear",
train_batch_size_per_learner=750,
grad_clip=40.0,
grad_clip_by="global_norm",
lr=0.00075,
vf_loss_coeff=0.01,
)
.rl_module(
model_config_dict={
"vf_share_layers": True,
"uses_new_env_runners": True,
},
replay_proportion=0.0,
)
.multi_agent(
policy_mapping_fn=(lambda agent_id, episode, **kwargs: f"p{agent_id}"),
policies={f"p{i}" for i in range(args.num_agents)},
)
)

stop = {
f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": 600, # 600 / 4 (==num_agents) = 150
f"{NUM_ENV_STEPS_SAMPLED_LIFETIME}": 200000,
f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": 400.0 * args.num_agents,
NUM_ENV_STEPS_SAMPLED_LIFETIME: 2000000,
}


if __name__ == "__main__":
from ray.rllib.utils.test_utils import run_rllib_example_script_experiment

run_rllib_example_script_experiment(config, args, stop=stop)
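
A side note on the rewritten stop criterion: the multi-agent episode return reported under ENV_RUNNER_RESULTS is the sum over all agents (the old "600 / 4 (==num_agents) = 150" comment relied on the same fact), so the threshold scales with --num-agents. A tiny illustrative sketch for the parser default of two agents, mirroring the expressions in the script above:

num_agents = 2  # parser.set_defaults(num_agents=2, ...)

# One policy per agent, matching the policy_mapping_fn f"p{agent_id}".
policies = {f"p{i}" for i in range(num_agents)}  # {"p0", "p1"}

# 400.0 return per agent, summed across agents into the episode return.
stop_return = 400.0 * num_agents  # 800.0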

This file was deleted.

43 changes: 43 additions & 0 deletions rllib/tuned_examples/impala/multi_agent_cartpole_impala_old_api_stack.py
@@ -0,0 +1,43 @@
# @OldAPIStack
from ray.rllib.algorithms.impala import ImpalaConfig
from ray.rllib.examples.envs.classes.multi_agent import MultiAgentCartPole
from ray.rllib.utils.metrics import (
ENV_RUNNER_RESULTS,
EPISODE_RETURN_MEAN,
NUM_ENV_STEPS_SAMPLED_LIFETIME,
)
from ray import tune

tune.registry.register_env("env", lambda cfg: MultiAgentCartPole(config=cfg))


config = (
ImpalaConfig()
.environment("env", env_config={"num_agents": 4})
.env_runners(
num_envs_per_env_runner=5,
num_env_runners=4,
observation_filter="MeanStdFilter",
)
.resources(num_gpus=1, _fake_gpus=True)
.multi_agent(
policies=["p0", "p1", "p2", "p3"],
policy_mapping_fn=(lambda agent_id, episode, worker, **kwargs: f"p{agent_id}"),
)
.training(
num_sgd_iter=1,
vf_loss_coeff=0.005,
vtrace=True,
model={
"fcnet_hiddens": [32],
"fcnet_activation": "linear",
"vf_share_layers": True,
},
replay_proportion=0.0,
)
)

stop = {
f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": 600, # 600 / 4 (==num_agents) = 150
f"{NUM_ENV_STEPS_SAMPLED_LIFETIME}": 200000,
}
9 changes: 7 additions & 2 deletions rllib/tuned_examples/impala/pong_impala.py
@@ -4,6 +4,11 @@
from ray.rllib.core.rl_module.rl_module import SingleAgentRLModuleSpec
from ray.rllib.env.wrappers.atari_wrappers import wrap_atari_for_new_api_stack
from ray.rllib.examples.rl_modules.classes.tiny_atari_cnn import TinyAtariCNN
from ray.rllib.utils.metrics import (
ENV_RUNNER_RESULTS,
EPISODE_RETURN_MEAN,
NUM_ENV_STEPS_SAMPLED_LIFETIME,
)
from ray.rllib.utils.test_utils import add_rllib_example_script_args
from ray.tune.registry import register_env

@@ -82,8 +87,8 @@ def _env_creator(cfg):
)

stop = {
"env_runner_results/episode_return_mean": 20.0,
"num_env_steps_sampled_lifetime": 5000000,
f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": 20.0,
NUM_ENV_STEPS_SAMPLED_LIFETIME: 5000000,
}

