[RLlib] Fix SAC/DQN/CQL GPU and multi-GPU. #47179

Merged (21 commits) on Aug 19, 2024
2 changes: 1 addition & 1 deletion doc/source/rllib/rllib-algorithms.rst
@@ -26,7 +26,7 @@ as well as multi-GPU training on multi-node (GPU) clusters when using the `Anysc
+-----------------------------------------------------------------------------+------------------------------+------------------------------------+--------------------------------+
| :ref:`DQN/Rainbow (Deep Q Networks) <dqn>` | |single_agent| |multi_agent| | |multi_gpu| |multi_node_multi_gpu| | |discr_actions| |
+-----------------------------------------------------------------------------+------------------------------+------------------------------------+--------------------------------+
| :ref:`SAC (Soft Actor Critic) <sac>` | |single_agent| | |multi_gpu| |multi_node_multi_gpu| | |cont_actions| |
| :ref:`SAC (Soft Actor Critic) <sac>` | |single_agent| |multi_agent| | |multi_gpu| |multi_node_multi_gpu| | |cont_actions| |
+-----------------------------------------------------------------------------+------------------------------+------------------------------------+--------------------------------+
| **High-throughput on- and off policy** |
+-----------------------------------------------------------------------------+------------------------------+------------------------------------+--------------------------------+
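The doc change above now marks SAC as multi-agent capable (in addition to multi-GPU / multi-node multi-GPU). As a rough illustration only, here is a minimal sketch of what such a multi-agent, multi-GPU SAC setup might look like on the new API stack. It is not taken from this PR: the environment name is a placeholder that would need to be registered first, and the `.api_stack(...)` and `.learners(...)` setters are assumptions for recent Ray releases (older releases expose the same knobs via `.resources(num_learner_workers=..., num_gpus_per_learner_worker=...)`).

```python
# Hedged sketch, not part of this PR: multi-agent SAC with two GPU Learner workers.
from ray.rllib.algorithms.sac import SACConfig

config = (
    SACConfig()
    .api_stack(  # enable the new API stack (flag names vary by Ray version)
        enable_rl_module_and_learner=True,
        enable_env_runner_and_connector_v2=True,
    )
    .environment("multi_agent_pendulum")  # assumed pre-registered multi-agent env
    .multi_agent(
        policies={"p0", "p1"},
        # Map integer agent IDs 0/1 to policies "p0"/"p1".
        policy_mapping_fn=lambda agent_id, episode, **kw: f"p{agent_id}",
    )
    .training(train_batch_size_per_learner=256)
    .learners(num_learners=2, num_gpus_per_learner=1)  # assumed setter name
)

algo = config.build()
for _ in range(10):
    print(algo.train())
```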
36 changes: 36 additions & 0 deletions release/release_tests.yaml
@@ -2790,6 +2790,42 @@
cluster:
cluster_compute: 8gpus_96cpus_gce.yaml


# --------------------------
# SAC
# --------------------------
- name: rllib_learning_tests_halfcheetah_sac_torch
group: RLlib tests
working_dir: rllib_tests

stable: true

frequency: nightly
team: rllib
cluster:
byod:
type: gpu
post_build_script: byod_rllib_test.sh
runtime_env:
- RLLIB_TEST_NO_JAX_IMPORT=1
- LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/ray/.mujoco/mujoco210/bin
cluster_compute: 4gpus_64cpus.yaml

run:
timeout: 7200
script: python learning_tests/tuned_examples/sac/halfcheetah_sac.py --enable-new-api-stack --num-gpus=4 --num-env-runners=8 --stop-reward=1000.0 --as-release-test

alert: default

variations:
- __suffix__: aws
- __suffix__: gce
env: gce
frequency: manual
cluster:
cluster_compute: 4gpus_64cpus_gce.yaml


########################
# Core Nightly Tests
########################
100 changes: 88 additions & 12 deletions rllib/BUILD
@@ -349,6 +349,30 @@ py_test(
srcs = ["tuned_examples/dqn/cartpole_dqn.py"],
args = ["--as-test", "--enable-new-api-stack"]
)
py_test(
name = "learning_tests_cartpole_dqn_gpu",
main = "tuned_examples/dqn/cartpole_dqn.py",
tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_discrete", "learning_tests_pytorch_use_all_core", "gpu"],
size = "large",
srcs = ["tuned_examples/dqn/cartpole_dqn.py"],
args = ["--as-test", "--enable-new-api-stack", "--num-gpus=1"]
Collaborator: Does num-gpus=1 use a local or remote learner? Imo, we should test with both. What do you think @sven1977 ?

Contributor Author: For IMPALA/APPO, we should add a validation that these should never be run with a local Learner, b/c these are async algos that suffer tremendously from having the Learner not-async. Will add this check/error in a separate PR ...
)
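The validation mentioned in the reply above was deferred to a separate PR, so the following is a purely illustrative sketch of the kind of check described; the function name, parameters, and call site are assumptions, not RLlib API.

```python
# Hypothetical sketch of the described check: asynchronous algorithms such as
# IMPALA/APPO should refuse to run with a local (in-process) Learner.
def validate_async_learner_setup(algo_name: str, num_learners: int) -> None:
    """Reject local-Learner setups for asynchronous algorithms.

    With 0 remote Learners, the Learner runs in the main process and the
    otherwise asynchronous sampling/training loop degrades to synchronous.
    """
    if algo_name in ("IMPALA", "APPO") and num_learners == 0:
        raise ValueError(
            f"{algo_name} is an asynchronous algorithm and must be run with at "
            "least one remote Learner (num_learners >= 1); a local Learner "
            "would make the training loop effectively synchronous."
        )
```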
py_test(
name = "learning_tests_cartpole_dqn_multi_cpu",
main = "tuned_examples/dqn/cartpole_dqn.py",
tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_discrete", "learning_tests_pytorch_use_all_core"],
size = "large",
srcs = ["tuned_examples/dqn/cartpole_dqn.py"],
args = ["--as-test", "--enable-new-api-stack", "--num-gpus=2"]
)
py_test(
name = "learning_tests_cartpole_dqn_multi_gpu",
main = "tuned_examples/dqn/cartpole_dqn.py",
tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_discrete", "learning_tests_pytorch_use_all_core", "multi_gpu"],
size = "large",
srcs = ["tuned_examples/dqn/cartpole_dqn.py"],
args = ["--as-test", "--enable-new-api-stack", "--num-gpus=2"]
)
# MultiAgentCartPole
py_test(
name = "learning_tests_multi_agent_cartpole_dqn",
@@ -358,16 +382,29 @@ py_test(
srcs = ["tuned_examples/dqn/multi_agent_cartpole_dqn.py"],
args = ["--as-test", "--enable-new-api-stack", "--num-agents=2", "--num-cpus=4"]
)

#@OldAPIStack
py_test(
name = "learning_tests_cartpole_dqn_softq_old_api_stack",
main = "tests/run_regression_tests.py",
tags = ["team:rllib", "exclusive", "learning_tests", "learning_tests_discrete"],
size = "large", # bazel may complain about it being too long sometimes - large is on purpose as some frameworks take longer
srcs = ["tests/run_regression_tests.py"],
data = ["tuned_examples/dqn/cartpole-dqn-softq.yaml"],
args = ["--dir=tuned_examples/dqn"]
name = "learning_tests_multi_agent_cartpole_dqn_gpu",
main = "tuned_examples/dqn/multi_agent_cartpole_dqn.py",
tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_discrete", "learning_tests_pytorch_use_all_core", "gpu"],
size = "large",
srcs = ["tuned_examples/dqn/multi_agent_cartpole_dqn.py"],
args = ["--as-test", "--enable-new-api-stack", "--num-agents=2", "--num-cpus=4", "--num-gpus=1"]
Collaborator: Interesting, I thought this does not work: --num-gpus > 0 together with --num-cpus > 0 :)

Contributor Author: Good point. We need to get rid of this confusion some time soon. Note that these are command line options, not directly translatable to Algo config properties:
--num-cpus is the number of CPUs Ray provides for the entire cluster.
--num-gpus is the number of Learner workers; note that if no GPUs are available, --num-gpus still sets the number of Learner workers, but each worker then gets one CPU (instead of 1 GPU). :|
)
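To make the flag semantics described in that reply concrete, here is an illustrative sketch. It is not the actual argument-parsing helper RLlib's tuned examples use; the trailing `config.learners(...)` call mentioned in the comment is an assumption about the resource-setter name.

```python
# Illustrative mapping of the CLI flags discussed above (assumed names/logic):
# --num-cpus caps the local Ray cluster; --num-gpus sets the number of Learner
# workers, falling back to 1 CPU per Learner when no GPUs are available.
import argparse

import ray

parser = argparse.ArgumentParser()
parser.add_argument("--num-cpus", type=int, default=0)  # CPUs for the whole cluster (0 = auto)
parser.add_argument("--num-gpus", type=int, default=0)  # number of Learner workers
args = parser.parse_args()

ray.init(num_cpus=args.num_cpus or None)

# Keep the requested number of Learner workers either way; only assign GPUs
# if the cluster actually has enough of them.
num_learners = args.num_gpus
available_gpus = int(ray.cluster_resources().get("GPU", 0))
num_gpus_per_learner = 1 if 0 < num_learners <= available_gpus else 0

# These values would then feed an (assumed) resource setter such as
# `config.learners(num_learners=num_learners, num_gpus_per_learner=num_gpus_per_learner)`.
```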
py_test(
name = "learning_tests_multi_agent_cartpole_dqn_multi_cpu",
main = "tuned_examples/dqn/multi_agent_cartpole_dqn.py",
tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_discrete", "learning_tests_pytorch_use_all_core"],
size = "large",
srcs = ["tuned_examples/dqn/multi_agent_cartpole_dqn.py"],
args = ["--as-test", "--enable-new-api-stack", "--num-agents=2", "--num-cpus=5", "--num-gpus=2"]
)
py_test(
name = "learning_tests_multi_agent_cartpole_dqn_multi_gpu",
main = "tuned_examples/dqn/multi_agent_cartpole_dqn.py",
tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_discrete", "learning_tests_pytorch_use_all_core", "multi_gpu"],
size = "large",
srcs = ["tuned_examples/dqn/multi_agent_cartpole_dqn.py"],
args = ["--as-test", "--enable-new-api-stack", "--num-agents=2", "--num-cpus=4", "--num-gpus=2"]
)

# IMPALA
@@ -669,7 +706,31 @@ py_test(
srcs = ["tuned_examples/sac/pendulum_sac.py"],
args = ["--as-test", "--enable-new-api-stack"]
)

py_test(
name = "learning_tests_pendulum_sac_gpu",
main = "tuned_examples/sac/pendulum_sac.py",
tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_continuous", "gpu"],
size = "large",
srcs = ["tuned_examples/sac/pendulum_sac.py"],
args = ["--as-test", "--enable-new-api-stack", "--num-gpus=1"]
)
py_test(
name = "learning_tests_pendulum_sac_multi_cpu",
main = "tuned_examples/sac/pendulum_sac.py",
tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_continuous"],
size = "large",
srcs = ["tuned_examples/sac/pendulum_sac.py"],
args = ["--as-test", "--enable-new-api-stack", "--num-gpus=2"]
)
py_test(
name = "learning_tests_pendulum_sac_multi_gpu",
main = "tuned_examples/sac/pendulum_sac.py",
tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_continuous", "multi_gpu"],
size = "large",
srcs = ["tuned_examples/sac/pendulum_sac.py"],
args = ["--as-test", "--enable-new-api-stack", "--num-gpus=2"]
)
# MultiAgentPendulum
py_test(
name = "learning_tests_multi_agent_pendulum_sac",
main = "tuned_examples/sac/multi_agent_pendulum_sac.py",
@@ -678,7 +739,22 @@ py_test(
srcs = ["tuned_examples/sac/multi_agent_pendulum_sac.py"],
args = ["--as-test", "--enable-new-api-stack", "--num-agents=2", "--num-cpus=4"]
)

py_test(
name = "learning_tests_multi_agent_pendulum_sac_gpu",
main = "tuned_examples/sac/multi_agent_pendulum_sac.py",
tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_continuous", "gpu"],
size = "large",
srcs = ["tuned_examples/sac/multi_agent_pendulum_sac.py"],
args = ["--as-test", "--enable-new-api-stack", "--num-agents=2", "--num-cpus=4", "--num-gpus=1"]
)
py_test(
name = "learning_tests_multi_agent_pendulum_sac_multi_cpu",
main = "tuned_examples/sac/multi_agent_pendulum_sac.py",
tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_continuous"],
size = "large",
srcs = ["tuned_examples/sac/multi_agent_pendulum_sac.py"],
Collaborator: Do we actually need the srcs for files that can be executed directly via python?
args = ["--enable-new-api-stack", "--num-agents=2", "--num-gpus=2"]
)
py_test(
name = "learning_tests_multi_agent_pendulum_sac_multi_gpu",
main = "tuned_examples/sac/multi_agent_pendulum_sac.py",
@@ -3240,7 +3316,7 @@ py_test(
name = "examples/rl_modules/custom_lstm_rl_module",
main = "examples/rl_modules/custom_lstm_rl_module.py",
tags = ["team:rllib", "examples"],
size = "medium",
size = "large",
srcs = ["examples/rl_modules/custom_lstm_rl_module.py"],
args = ["--as-test", "--enable-new-api-stack"],
)
32 changes: 16 additions & 16 deletions rllib/algorithms/algorithm.py
@@ -109,9 +109,7 @@
ENV_RUNNER_RESULTS,
ENV_RUNNER_SAMPLING_TIMER,
EPISODE_LEN_MEAN,
EPISODE_RETURN_MAX,
EPISODE_RETURN_MEAN,
EPISODE_RETURN_MIN,
EVALUATION_ITERATION_TIMER,
EVALUATION_RESULTS,
FAULT_TOLERANCE_STATS,
@@ -1701,7 +1699,7 @@ def training_step(self) -> ResultDict:
if self.config.count_steps_by == "agent_steps":
train_batch, env_runner_results = synchronous_parallel_sample(
worker_set=self.env_runner_group,
max_agent_steps=self.config.train_batch_size,
max_agent_steps=self.config.total_train_batch_size,
sample_timeout_s=self.config.sample_timeout_s,
_uses_new_env_runners=(
self.config.enable_env_runner_and_connector_v2
Expand All @@ -1711,7 +1709,7 @@ def training_step(self) -> ResultDict:
else:
train_batch, env_runner_results = synchronous_parallel_sample(
worker_set=self.env_runner_group,
max_env_steps=self.config.train_batch_size,
max_env_steps=self.config.total_train_batch_size,
sample_timeout_s=self.config.sample_timeout_s,
_uses_new_env_runners=(
self.config.enable_env_runner_and_connector_v2
@@ -3846,21 +3844,23 @@ def _compile_iteration_results_new_api_stack(
# Return dict (shallow copy of `train_results`).
results: ResultDict = train_results.copy()

# TODO (sven): Fix Tune, instead, to be tolerant against possibly missing result
# keys. Otherwise, we'll have to guess here, what "popular" keys users use in
# order to protect them from running into Tune KeyErrors.
if ENV_RUNNER_RESULTS not in results:
results[ENV_RUNNER_RESULTS] = {}
for must_have in [
EPISODE_RETURN_MEAN,
EPISODE_RETURN_MIN,
EPISODE_RETURN_MAX,
]:
if must_have not in results[ENV_RUNNER_RESULTS]:
results[ENV_RUNNER_RESULTS][must_have] = np.nan
# Collect old-API-stack-style `self._timers` results.
for k, timer in self._timers.items():
if TIMERS not in results:
results[TIMERS] = {}
results[TIMERS]["{}_time_sec".format(k)] = timer.mean
if timer.has_units_processed():
results[TIMERS]["{}_throughput".format(k)] = round(
timer.mean_throughput, 3
)

# Evaluation results.
if eval_results:
assert (
isinstance(eval_results, dict)
and len(eval_results) == 1
and EVALUATION_RESULTS in eval_results
)
results.update(eval_results)
# Fault tolerance stats.
results[FAULT_TOLERANCE_STATS] = {
2 changes: 1 addition & 1 deletion rllib/algorithms/algorithm_config.py
@@ -376,9 +376,9 @@ def __init__(self, algo_class: Optional[type] = None):
self.lr = 0.001
self.grad_clip = None
self.grad_clip_by = "global_norm"
self.train_batch_size = 32
# Simple logic for now: If None, use `train_batch_size`.
self.train_batch_size_per_learner = None
self.train_batch_size = 32 # @OldAPIStack
# TODO (sven): Unsolved problem with RLModules sometimes requiring settings from
# the main AlgorithmConfig. We should not require the user to provide those
# settings in both, the AlgorithmConfig (as property) AND the model config
27 changes: 16 additions & 11 deletions rllib/algorithms/dqn/dqn.py
@@ -58,6 +58,7 @@
NUM_MODULE_STEPS_TRAINED,
NUM_MODULE_STEPS_TRAINED_LIFETIME,
NUM_TARGET_UPDATES,
REPLAY_BUFFER_ADD_DATA_TIMER,
REPLAY_BUFFER_SAMPLE_TIMER,
REPLAY_BUFFER_UPDATE_PRIOS_TIMER,
SAMPLE_TIMER,
@@ -556,7 +557,7 @@ def calculate_rr_weights(config: AlgorithmConfig) -> List[float]:
# This is to set freshly rollout-collected data in relation to
# the data we pull from the replay buffer (which also contains old
# samples).
native_ratio = config.train_batch_size / (
native_ratio = config.total_train_batch_size / (
config.get_rollout_fragment_length()
* config.num_envs_per_env_runner
# Add one to workers because the local
@@ -628,13 +629,15 @@ def _training_step_new_api_stack(self, *, with_noise_reset) -> ResultDict:
_uses_new_env_runners=True,
_return_metrics=True,
)
# Add the sampled experiences to the replay buffer.
self.local_replay_buffer.add(episodes)
# Reduce EnvRunner metrics over the n EnvRunners.
self.metrics.merge_and_log_n_dicts(
env_runner_results, key=ENV_RUNNER_RESULTS
)

# Add the sampled experiences to the replay buffer.
with self.metrics.log_time((TIMERS, REPLAY_BUFFER_ADD_DATA_TIMER)):
Collaborator: Nice :)
self.local_replay_buffer.add(episodes)

self.metrics.log_dict(
self.metrics.peek(
(ENV_RUNNER_RESULTS, NUM_AGENT_STEPS_SAMPLED), default={}
@@ -684,7 +687,7 @@ def _training_step_new_api_stack(self, *, with_noise_reset) -> ResultDict:
# Sample a list of episodes used for learning from the replay buffer.
with self.metrics.log_time((TIMERS, REPLAY_BUFFER_SAMPLE_TIMER)):
episodes = self.local_replay_buffer.sample(
num_items=self.config.train_batch_size,
num_items=self.config.total_train_batch_size,
n_step=self.config.n_step,
gamma=self.config.gamma,
beta=self.config.replay_buffer_config.get("beta"),
@@ -707,14 +710,16 @@ def _training_step_new_api_stack(self, *, with_noise_reset) -> ResultDict:
# disk or WandB, they might be very large).
td_errors = defaultdict(list)
for res in learner_results:
for mid, m_res in res.items():
if TD_ERROR_KEY in m_res:
td_errors[mid].extend(
convert_to_numpy(m_res.pop(TD_ERROR_KEY).peek())
for module_id, module_results in res.items():
if TD_ERROR_KEY in module_results:
td_errors[module_id].extend(
convert_to_numpy(
module_results.pop(TD_ERROR_KEY).peek()
)
)
td_errors = {
mid: {TD_ERROR_KEY: np.concatenate(s, axis=0)}
for mid, s in td_errors.items()
module_id: {TD_ERROR_KEY: np.concatenate(s, axis=0)}
for module_id, s in td_errors.items()
}
self.metrics.merge_and_log_n_dicts(
learner_results, key=LEARNER_RESULTS
@@ -812,7 +817,7 @@ def _training_step_old_and_hybrid_api_stack(self) -> ResultDict:
# Sample training batch (MultiAgentBatch) from replay buffer.
train_batch = sample_min_n_steps_from_buffer(
self.local_replay_buffer,
self.config.train_batch_size,
self.config.total_train_batch_size,
count_by_agent_steps=self.config.count_steps_by == "agent_steps",
)

3 changes: 2 additions & 1 deletion rllib/algorithms/sac/sac.py
@@ -100,7 +100,8 @@ def __init__(self, algo_class=None):
}

# .training()
self.train_batch_size = 256
self.train_batch_size_per_learner = 256
self.train_batch_size = 256 # @OldAPIstack
# Number of timesteps to collect from rollout workers before we start
# sampling from replay buffers for learning. Whether we count this in agent
# steps or environment steps depends on config.multi_agent(count_steps_by=..).
6 changes: 5 additions & 1 deletion rllib/algorithms/sac/sac_learner.py
@@ -31,7 +31,11 @@ def build(self) -> None:
self.curr_log_alpha: Dict[ModuleID, TensorType] = LambdaDefaultDict(
lambda module_id: self._get_tensor_variable(
# Note, we want to train the temperature parameter.
[np.log(self.config.get_config_for_module(module_id).initial_alpha)],
[
np.log(
self.config.get_config_for_module(module_id).initial_alpha
).astype(np.float32)
],
trainable=True,
)
)
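A brief aside on the `.astype(np.float32)` cast added in this hunk (illustration only, not part of the diff): `np.log` on a Python float returns a NumPy float64 value, which, if left uncast, can seed the trainable log-alpha variable as float64 next to float32 module weights. The snippet below only demonstrates the dtype behavior of NumPy and Torch, not RLlib internals.

```python
import numpy as np
import torch

initial_alpha = 1.0
print(np.log(initial_alpha).dtype)                     # float64
print(np.log(initial_alpha).astype(np.float32).dtype)  # float32

# A tensor built from a float64 NumPy array stays float64 unless cast:
print(torch.from_numpy(np.array([np.log(initial_alpha)])).dtype)  # torch.float64
```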