[RLlib] APPO config objects #24376

Merged (8 commits) on May 2, 2022
Changes from 2 commits
2 changes: 1 addition & 1 deletion rllib/__init__.py
@@ -57,7 +57,7 @@ def setup(self, config):

_setup_logger()

usage_lib.record_library_usage("rllib")
# usage_lib.record_library_usage("rllib")
Contributor
Could you please explain this? Can we remove it altogether?

Contributor Author
Oh, sorry, this shouldn't be here. Will remove ...


__all__ = [
"Policy",
5 changes: 3 additions & 2 deletions rllib/agents/impala/__init__.py
@@ -1,6 +1,7 @@
from ray.rllib.agents.impala.impala import DEFAULT_CONFIG, ImpalaTrainer
from ray.rllib.agents.impala.impala import DEFAULT_CONFIG, ImpalaConfig, ImpalaTrainer

__all__ = [
"DEFAULT_CONFIG",
"ImpalaConfig",
"ImpalaTrainer",
"DEFAULT_CONFIG",
]
384 changes: 277 additions & 107 deletions rllib/agents/impala/impala.py

Large diffs are not rendered by default.

47 changes: 28 additions & 19 deletions rllib/agents/impala/tests/test_impala.py
@@ -26,26 +26,32 @@ def tearDownClass(cls) -> None:

def test_impala_compilation(self):
"""Test whether an ImpalaTrainer can be built with both frameworks."""
config = impala.DEFAULT_CONFIG.copy()
config["num_gpus"] = 0
config["model"]["lstm_use_prev_action"] = True
config["model"]["lstm_use_prev_reward"] = True
config = (
impala.ImpalaConfig()
.resources(num_gpus=0)
.training(
model={
"lstm_use_prev_action": True,
"lstm_use_prev_reward": True,
}
)
)

num_iterations = 1
env = "CartPole-v0"

for _ in framework_iterator(config, with_eager_tracing=True):
local_cfg = config.copy()
for lstm in [False, True]:
local_cfg["num_aggregation_workers"] = 0 if not lstm else 1
local_cfg["model"]["use_lstm"] = lstm
config.num_aggregation_workers = 0 if not lstm else 1
config.model["use_lstm"] = lstm
print(
"lstm={} aggregation-workers={}".format(
lstm, local_cfg["num_aggregation_workers"]
lstm, config.num_aggregation_workers
)
)
# Test with and w/o aggregation workers (this has nothing
# to do with LSTMs, though).
trainer = impala.ImpalaTrainer(config=local_cfg, env=env)
trainer = config.build(env=env)
for i in range(num_iterations):
results = trainer.train()
check_train_results(results)
@@ -59,25 +65,28 @@ def test_impala_compilation(self):
trainer.stop()

def test_impala_lr_schedule(self):
config = impala.DEFAULT_CONFIG.copy()
config["num_gpus"] = 0
# Test whether we correctly ignore the "lr" setting.
# The first lr should be 0.05.
config["lr"] = 0.1
config["lr_schedule"] = [
[0, 0.05],
[10000, 0.000001],
]
config["num_gpus"] = 0 # Do not use any (fake) GPUs.
config["env"] = "CartPole-v0"
config = (
impala.ImpalaConfig()
.resources(num_gpus=0)
.training(
lr=0.1,
lr_schedule=[
[0, 0.05],
[10000, 0.000001],
],
)
)
config.environment(env="CartPole-v0")

def get_lr(result):
return result["info"][LEARNER_INFO][DEFAULT_POLICY_ID][LEARNER_STATS_KEY][
"cur_lr"
]

for fw in framework_iterator(config):
trainer = impala.ImpalaTrainer(config=config)
trainer = config.build()
policy = trainer.get_policy()

try:
3 changes: 2 additions & 1 deletion rllib/agents/ppo/__init__.py
@@ -1,10 +1,11 @@
from ray.rllib.agents.ppo.ppo import PPOConfig, PPOTrainer, DEFAULT_CONFIG
from ray.rllib.agents.ppo.ppo_tf_policy import PPOTFPolicy
from ray.rllib.agents.ppo.ppo_torch_policy import PPOTorchPolicy
from ray.rllib.agents.ppo.appo import APPOTrainer
from ray.rllib.agents.ppo.appo import APPOConfig, APPOTrainer
from ray.rllib.agents.ppo.ddppo import DDPPOTrainer

__all__ = [
"APPOConfig",
"APPOTrainer",
"DDPPOTrainer",
"DEFAULT_CONFIG",
210 changes: 149 additions & 61 deletions rllib/agents/ppo/appo.py
@@ -23,69 +23,140 @@
_get_shared_metrics,
)
from ray.rllib.utils.annotations import override
from ray.rllib.utils.deprecation import Deprecated
from ray.rllib.utils.typing import PartialTrainerConfigDict, TrainerConfigDict

# fmt: off
# __sphinx_doc_begin__

# Adds the following updates to the `IMPALATrainer` config in
# rllib/agents/impala/impala.py.
DEFAULT_CONFIG = impala.ImpalaTrainer.merge_trainer_configs(
impala.DEFAULT_CONFIG, # See keys in impala.py, which are also supported.
{
# Whether to use V-trace weighted advantages. If false, PPO GAE
# advantages will be used instead.
"vtrace": True,

# == These two options only apply if vtrace: False ==
# Should use a critic as a baseline (otherwise don't use value
# baseline; required for using GAE).
"use_critic": True,
# If true, use the Generalized Advantage Estimator (GAE)
# with a value function, see https://arxiv.org/pdf/1506.02438.pdf.
"use_gae": True,
# GAE(lambda) parameter
"lambda": 1.0,

# == PPO surrogate loss options ==
"clip_param": 0.4,

# == PPO KL Loss options ==
"use_kl_loss": False,
"kl_coeff": 1.0,
"kl_target": 0.01,

# == IMPALA optimizer params (see documentation in impala.py) ==
"rollout_fragment_length": 50,
"train_batch_size": 500,
"min_time_s_per_reporting": 10,
"num_workers": 2,
"num_gpus": 0,
"num_multi_gpu_tower_stacks": 1,
"minibatch_buffer_size": 1,
"num_sgd_iter": 1,
"replay_proportion": 0.0,
"replay_buffer_num_slots": 100,
"learner_queue_size": 16,
"learner_queue_timeout": 300,
"max_sample_requests_in_flight_per_worker": 2,
"broadcast_interval": 1,
"grad_clip": 40.0,
"opt_type": "adam",
"lr": 0.0005,
"lr_schedule": None,
"decay": 0.99,
"momentum": 0.0,
"epsilon": 0.1,
"vf_loss_coeff": 0.5,
"entropy_coeff": 0.01,
"entropy_coeff_schedule": None,
},
_allow_unknown_configs=True,
)

# __sphinx_doc_end__
# fmt: on
class APPOConfig(impala.ImpalaConfig):
"""Defines a A2CTrainer configuration class from which a new Trainer can be built.

Example:
>>> from ray import tune
Contributor
APPOConfig import

>>> config = APPOConfig().training(lr=0.01, grad_clip=30.0)\
... .resources(num_gpus=1)\
... .rollouts(num_rollout_workers=16)
>>> print(config.to_dict())
>>> # Build a Trainer object from the config and run 1 training iteration.
>>> trainer = config.build(env="CartPole-v1")
>>> trainer.train()

Example:
>>> config = APPOConfig()
>>> # Print out some default values.
>>> print(config.sample_async)
>>> # Update the config object.
>>> config.training(lr=tune.grid_search([0.001, 0.0001]))
>>> # Set the config object's env.
>>> config.environment(env="CartPole-v1")
>>> # Use to_dict() to get the old-style python config dict
>>> # when running with tune.
>>> tune.run(
... "APPO",
... stop={"episode_reward_mean": 200},
... config=config.to_dict(),
... )
"""

def __init__(self, trainer_class=None):
"""Initializes a APPOConfig instance."""
super().__init__(trainer_class=trainer_class or APPOTrainer)

# fmt: off
# __sphinx_doc_begin__

# APPO specific settings:
self.vtrace = True
self.use_critic = True
self.use_gae = True
self.lambda_ = 1.0
self.clip_param = 0.4
self.use_kl_loss = False
self.kl_coeff = 1.0
self.kl_target = 0.01

# Override some of ImpalaConfig's default values with APPO-specific values.
self.rollout_fragment_length = 50
self.train_batch_size = 500
self.min_time_s_per_reporting = 10
self.num_workers = 2
self.num_gpus = 0
self.num_multi_gpu_tower_stacks = 1
self.minibatch_buffer_size = 1
self.num_sgd_iter = 1
self.replay_proportion = 0.0
self.replay_buffer_num_slots = 100
self.learner_queue_size = 16
self.learner_queue_timeout = 300
self.max_sample_requests_in_flight_per_worker = 2
self.broadcast_interval = 1
self.grad_clip = 40.0
self.opt_type = "adam"
self.lr = 0.0005
self.lr_schedule = None
self.decay = 0.99
self.momentum = 0.0
self.epsilon = 0.1
self.vf_loss_coeff = 0.5
self.entropy_coeff = 0.01
self.entropy_coeff_schedule = None
# __sphinx_doc_end__
# fmt: on

@override(impala.ImpalaConfig)
def training(
self,
*,
vtrace: Optional[bool] = None,
use_critic: Optional[bool] = None,
use_gae: Optional[bool] = None,
lambda_: Optional[float] = None,
clip_param: Optional[float] = None,
use_kl_loss: Optional[bool] = None,
kl_coeff: Optional[float] = None,
kl_target: Optional[float] = None,
**kwargs,
) -> "APPOConfig":
"""Sets the training related configuration.

Args:
vtrace: Whether to use V-trace weighted advantages. If false, PPO GAE
advantages will be used instead.
use_critic: Should use a critic as a baseline (otherwise don't use value
baseline; required for using GAE). Only applies if vtrace=False.
use_gae: If true, use the Generalized Advantage Estimator (GAE)
with a value function, see https://arxiv.org/pdf/1506.02438.pdf.
Only applies if vtrace=False.
lambda_: GAE (lambda) parameter.
            clip_param: PPO surrogate clipping parameter.
use_kl_loss: Whether to use the KL-term in the loss function.
kl_coeff: Coefficient for weighting the KL-loss term.
kl_target: Target term for the KL-term to reach (via adjusting the
`kl_coeff` automatically).

Returns:
This updated TrainerConfig object.
"""
# Pass kwargs onto super's `training()` method.
super().training(**kwargs)

if vtrace is not None:
self.vtrace = vtrace
if use_critic is not None:
self.use_critic = use_critic
if use_gae is not None:
self.use_gae = use_gae
if lambda_ is not None:
self.lambda_ = lambda_
if clip_param is not None:
self.clip_param = clip_param
if use_kl_loss is not None:
self.use_kl_loss = use_kl_loss
if kl_coeff is not None:
self.kl_coeff = kl_coeff
if kl_target is not None:
self.kl_target = kl_target

return self
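
A quick usage sketch of the APPO-specific training() arguments documented above, assuming the APPOConfig import from the earlier sketch and arbitrary values: turning V-trace off switches the loss to PPO-style GAE advantages, and the KL settings only take effect with use_kl_loss=True.

    # Sketch: exercising the APPO-specific training() overrides;
    # values are illustrative, not recommended settings.
    config = APPOConfig().training(
        vtrace=False,      # fall back to PPO GAE advantages
        use_gae=True,      # only applies when vtrace=False
        lambda_=0.95,
        clip_param=0.3,
        use_kl_loss=True,
        kl_coeff=0.5,
        kl_target=0.01,
    )
    trainer = config.build(env="CartPole-v1")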


class UpdateTargetAndKL:
@@ -130,7 +201,7 @@ def __init__(self, config, *args, **kwargs):
@classmethod
@override(Trainer)
def get_default_config(cls) -> TrainerConfigDict:
return DEFAULT_CONFIG
return APPOConfig().to_dict()

@override(Trainer)
def get_default_policy_class(
@@ -142,3 +213,20 @@ def get_default_policy_class(
return AsyncPPOTorchPolicy
else:
return AsyncPPOTFPolicy


# Deprecated: Use ray.rllib.agents.ppo.appo.APPOConfig instead!
class _deprecated_default_config(dict):
def __init__(self):
super().__init__(APPOConfig().to_dict())

@Deprecated(
old="ray.rllib.agents.ppo.appo.DEFAULT_CONFIG",
new="ray.rllib.agents.ppo.appo.APPOConfig(...)",
error=False,
)
def __getitem__(self, item):
return super().__getitem__(item)


DEFAULT_CONFIG = _deprecated_default_config()
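
The dict wrapper above keeps old-style key access working while warning through @Deprecated; a hedged sketch of the migration it encourages:

    # Old style: still works, but key access emits a deprecation warning.
    from ray.rllib.agents.ppo import appo
    old_lr = appo.DEFAULT_CONFIG["lr"]

    # New style: configure and read attributes on an APPOConfig instance.
    from ray.rllib.agents.ppo import APPOConfig
    new_cfg = APPOConfig().training(lr=0.0005)
    assert new_cfg.lr == 0.0005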