[RLlib] APPO config objects #24376

Merged · 8 commits · May 2, 2022
4 changes: 2 additions & 2 deletions rllib/agents/impala/impala.py
@@ -62,9 +62,9 @@ class ImpalaConfig(TrainerConfig):
... )
"""

def __init__(self):
def __init__(self, trainer_class=None):
"""Initializes a ImpalaConfig instance."""
super().__init__(trainer_class=ImpalaTrainer)
super().__init__(trainer_class=trainer_class or ImpalaTrainer)

# fmt: off
# __sphinx_doc_begin__
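The `trainer_class` parameter threaded through `ImpalaConfig.__init__` above is what lets subclass configs (such as `APPOConfig` later in this PR) reuse the IMPALA defaults while registering their own trainer class. A minimal standalone sketch of that pattern, using illustrative class names rather than real RLlib classes:

class BaseTrainer:
    pass


class DerivedTrainer(BaseTrainer):
    pass


class BaseConfig:
    def __init__(self, trainer_class=None):
        # Fall back to the base trainer unless a subclass forwards its own.
        self.trainer_class = trainer_class or BaseTrainer


class DerivedConfig(BaseConfig):
    def __init__(self, trainer_class=None):
        # `trainer_class or DerivedTrainer` keeps the override chain intact:
        # a further subclass could still pass its own trainer class down.
        super().__init__(trainer_class=trainer_class or DerivedTrainer)


print(BaseConfig().trainer_class.__name__)     # -> BaseTrainer
print(DerivedConfig().trainer_class.__name__)  # -> DerivedTrainer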
3 changes: 2 additions & 1 deletion rllib/agents/ppo/__init__.py
@@ -1,10 +1,11 @@
from ray.rllib.agents.ppo.ppo import PPOConfig, PPOTrainer, DEFAULT_CONFIG
from ray.rllib.agents.ppo.ppo_tf_policy import PPOTFPolicy
from ray.rllib.agents.ppo.ppo_torch_policy import PPOTorchPolicy
from ray.rllib.agents.ppo.appo import APPOTrainer
from ray.rllib.agents.ppo.appo import APPOConfig, APPOTrainer
from ray.rllib.agents.ppo.ddppo import DDPPOTrainer

__all__ = [
"APPOConfig",
"APPOTrainer",
"DDPPOTrainer",
"DEFAULT_CONFIG",
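With `APPOConfig` added to `__all__`, the config class can be imported from the same place as the trainer. A quick sanity-check sketch (assumes a Ray install with RLlib; not part of the PR itself):

import ray.rllib.agents.ppo as ppo

# The config class is now exported next to the trainer.
assert "APPOConfig" in ppo.__all__
print(type(ppo.APPOConfig()).__name__)  # -> APPOConfig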
212 changes: 151 additions & 61 deletions rllib/agents/ppo/appo.py
@@ -23,69 +23,142 @@
_get_shared_metrics,
)
from ray.rllib.utils.annotations import override
from ray.rllib.utils.deprecation import Deprecated
from ray.rllib.utils.typing import PartialTrainerConfigDict, TrainerConfigDict

# fmt: off
# __sphinx_doc_begin__

# Adds the following updates to the `IMPALATrainer` config in
# rllib/agents/impala/impala.py.
DEFAULT_CONFIG = impala.ImpalaTrainer.merge_trainer_configs(
impala.DEFAULT_CONFIG, # See keys in impala.py, which are also supported.
{
# Whether to use V-trace weighted advantages. If false, PPO GAE
# advantages will be used instead.
"vtrace": True,

# == These two options only apply if vtrace: False ==
# Should use a critic as a baseline (otherwise don't use value
# baseline; required for using GAE).
"use_critic": True,
# If true, use the Generalized Advantage Estimator (GAE)
# with a value function, see https://arxiv.org/pdf/1506.02438.pdf.
"use_gae": True,
# GAE(lambda) parameter
"lambda": 1.0,

# == PPO surrogate loss options ==
"clip_param": 0.4,

# == PPO KL Loss options ==
"use_kl_loss": False,
"kl_coeff": 1.0,
"kl_target": 0.01,

# == IMPALA optimizer params (see documentation in impala.py) ==
"rollout_fragment_length": 50,
"train_batch_size": 500,
"min_time_s_per_reporting": 10,
"num_workers": 2,
"num_gpus": 0,
"num_multi_gpu_tower_stacks": 1,
"minibatch_buffer_size": 1,
"num_sgd_iter": 1,
"replay_proportion": 0.0,
"replay_buffer_num_slots": 100,
"learner_queue_size": 16,
"learner_queue_timeout": 300,
"max_sample_requests_in_flight_per_worker": 2,
"broadcast_interval": 1,
"grad_clip": 40.0,
"opt_type": "adam",
"lr": 0.0005,
"lr_schedule": None,
"decay": 0.99,
"momentum": 0.0,
"epsilon": 0.1,
"vf_loss_coeff": 0.5,
"entropy_coeff": 0.01,
"entropy_coeff_schedule": None,
},
_allow_unknown_configs=True,
)

# __sphinx_doc_end__
# fmt: on
class APPOConfig(impala.ImpalaConfig):
"""Defines a APPOTrainer configuration class from which a new Trainer can be built.

Example:
>>> from ray.rllib.agents.ppo import APPOConfig
>>> config = APPOConfig().training(lr=0.01, grad_clip=30.0)\
... .resources(num_gpus=1)\
... .rollouts(num_rollout_workers=16)
>>> print(config.to_dict())
>>> # Build a Trainer object from the config and run 1 training iteration.
>>> trainer = config.build(env="CartPole-v1")
>>> trainer.train()

Example:
>>> from ray.rllib.agents.ppo import APPOConfig
>>> from ray import tune
>>> config = APPOConfig()
>>> # Print out some default values.
>>> print(config.sample_async)
>>> # Update the config object.
>>> config.training(lr=tune.grid_search([0.001, 0.0001]))
>>> # Set the config object's env.
>>> config.environment(env="CartPole-v1")
>>> # Use to_dict() to get the old-style python config dict
>>> # when running with tune.
>>> tune.run(
... "APPO",
... stop={"episode_reward_mean": 200},
... config=config.to_dict(),
... )
"""

def __init__(self, trainer_class=None):
"""Initializes a APPOConfig instance."""
super().__init__(trainer_class=trainer_class or APPOTrainer)

# fmt: off
# __sphinx_doc_begin__

# APPO specific settings:
self.vtrace = True
self.use_critic = True
self.use_gae = True
self.lambda_ = 1.0
self.clip_param = 0.4
self.use_kl_loss = False
self.kl_coeff = 1.0
self.kl_target = 0.01

# Override some of ImpalaConfig's default values with APPO-specific values.
self.rollout_fragment_length = 50
self.train_batch_size = 500
self.min_time_s_per_reporting = 10
self.num_workers = 2
self.num_gpus = 0
self.num_multi_gpu_tower_stacks = 1
self.minibatch_buffer_size = 1
self.num_sgd_iter = 1
self.replay_proportion = 0.0
self.replay_buffer_num_slots = 100
self.learner_queue_size = 16
self.learner_queue_timeout = 300
self.max_sample_requests_in_flight_per_worker = 2
self.broadcast_interval = 1
self.grad_clip = 40.0
self.opt_type = "adam"
self.lr = 0.0005
self.lr_schedule = None
self.decay = 0.99
self.momentum = 0.0
self.epsilon = 0.1
self.vf_loss_coeff = 0.5
self.entropy_coeff = 0.01
self.entropy_coeff_schedule = None
# __sphinx_doc_end__
# fmt: on

@override(impala.ImpalaConfig)
def training(
self,
*,
vtrace: Optional[bool] = None,
use_critic: Optional[bool] = None,
use_gae: Optional[bool] = None,
lambda_: Optional[float] = None,
clip_param: Optional[float] = None,
use_kl_loss: Optional[bool] = None,
kl_coeff: Optional[float] = None,
kl_target: Optional[float] = None,
**kwargs,
) -> "APPOConfig":
"""Sets the training related configuration.

Args:
vtrace: Whether to use V-trace weighted advantages. If false, PPO GAE
advantages will be used instead.
use_critic: Should use a critic as a baseline (otherwise don't use value
baseline; required for using GAE). Only applies if vtrace=False.
use_gae: If true, use the Generalized Advantage Estimator (GAE)
with a value function, see https://arxiv.org/pdf/1506.02438.pdf.
Only applies if vtrace=False.
lambda_: GAE (lambda) parameter.
clip_param: PPO surrogate clipping parameter.
use_kl_loss: Whether to use the KL-term in the loss function.
kl_coeff: Coefficient for weighting the KL-loss term.
kl_target: Target term for the KL-term to reach (via adjusting the
`kl_coeff` automatically).

Returns:
This updated TrainerConfig object.
"""
# Pass kwargs onto super's `training()` method.
super().training(**kwargs)

if vtrace is not None:
self.vtrace = vtrace
if use_critic is not None:
self.use_critic = use_critic
if use_gae is not None:
self.use_gae = use_gae
if lambda_ is not None:
self.lambda_ = lambda_
if clip_param is not None:
self.clip_param = clip_param
if use_kl_loss is not None:
self.use_kl_loss = use_kl_loss
if kl_coeff is not None:
self.kl_coeff = kl_coeff
if kl_target is not None:
self.kl_target = kl_target

return self


class UpdateTargetAndKL:
@@ -130,7 +203,7 @@ def __init__(self, config, *args, **kwargs):
@classmethod
@override(Trainer)
def get_default_config(cls) -> TrainerConfigDict:
return DEFAULT_CONFIG
return APPOConfig().to_dict()

@override(Trainer)
def get_default_policy_class(
@@ -142,3 +215,20 @@ def get_default_policy_class(
return AsyncPPOTorchPolicy
else:
return AsyncPPOTFPolicy


# Deprecated: Use ray.rllib.agents.ppo.APPOConfig instead!
class _deprecated_default_config(dict):
def __init__(self):
super().__init__(APPOConfig().to_dict())

@Deprecated(
old="ray.rllib.agents.ppo.appo.DEFAULT_CONFIG",
new="ray.rllib.agents.ppo.appo.APPOConfig(...)",
error=False,
)
def __getitem__(self, item):
return super().__getitem__(item)


DEFAULT_CONFIG = _deprecated_default_config()
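The `_deprecated_default_config` shim above keeps old-style `DEFAULT_CONFIG[...]` lookups working while steering users toward `APPOConfig`. The mechanism is simply a dict subclass whose item access emits a warning; here is a standalone sketch of the same idea using the standard `warnings` module instead of RLlib's `@Deprecated` decorator (the config values shown are illustrative, whereas the real shim wraps the full `APPOConfig().to_dict()`):

import warnings


class DeprecatedDefaultConfig(dict):
    """Dict that warns every time a key is read."""

    def __getitem__(self, item):
        warnings.warn(
            "DEFAULT_CONFIG is deprecated; build an APPOConfig() instead.",
            DeprecationWarning,
            stacklevel=2,
        )
        return super().__getitem__(item)


# Old-style access still works, but now carries a DeprecationWarning.
DEFAULT_CONFIG = DeprecatedDefaultConfig({"lr": 0.0005, "clip_param": 0.4})
print(DEFAULT_CONFIG["lr"])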
77 changes: 41 additions & 36 deletions rllib/agents/ppo/tests/test_appo.py
@@ -22,15 +22,13 @@ def tearDownClass(cls):

def test_appo_compilation(self):
"""Test whether an APPOTrainer can be built with both frameworks."""
config = ppo.appo.DEFAULT_CONFIG.copy()
config["num_workers"] = 1
config = ppo.appo.APPOConfig().rollouts(num_rollout_workers=1)
num_iterations = 2

for _ in framework_iterator(config, with_eager_tracing=True):
print("w/o v-trace")
_config = config.copy()
_config["vtrace"] = False
trainer = ppo.APPOTrainer(config=_config, env="CartPole-v0")
config.vtrace = False
trainer = config.build(env="CartPole-v0")
for i in range(num_iterations):
results = trainer.train()
check_train_results(results)
@@ -39,9 +37,8 @@ def test_appo_compilation(self):
trainer.stop()

print("w/ v-trace")
_config = config.copy()
_config["vtrace"] = True
trainer = ppo.APPOTrainer(config=_config, env="CartPole-v0")
config.vtrace = True
trainer = config.build(env="CartPole-v0")
for i in range(num_iterations):
results = trainer.train()
check_train_results(results)
@@ -51,13 +48,15 @@ def test_appo_compilation_use_kl_loss(self):

def test_appo_compilation_use_kl_loss(self):
"""Test whether an APPOTrainer can be built with kl_loss enabled."""
config = ppo.appo.DEFAULT_CONFIG.copy()
config["num_workers"] = 1
config["use_kl_loss"] = True
config = (
ppo.appo.APPOConfig()
.rollouts(num_rollout_workers=1)
.training(use_kl_loss=True)
)
num_iterations = 2

for _ in framework_iterator(config, with_eager_tracing=True):
trainer = ppo.APPOTrainer(config=config, env="CartPole-v0")
trainer = config.build(env="CartPole-v0")
for i in range(num_iterations):
results = trainer.train()
check_train_results(results)
@@ -66,22 +65,22 @@ def test_appo_compilation_use_kl_loss(self):
trainer.stop()

def test_appo_two_tf_optimizers(self):
config = ppo.appo.DEFAULT_CONFIG.copy()
config["num_workers"] = 1

# Not explicitly setting this should cause a warning, but not fail.
# config["_tf_policy_handles_more_than_one_loss"] = True
config["_separate_vf_optimizer"] = True
config["_lr_vf"] = 0.0002

config = (
ppo.appo.APPOConfig()
.rollouts(num_rollout_workers=1)
.training(_separate_vf_optimizer=True, _lr_vf=0.002)
)
# Make sure we have two completely separate models for policy and
# value function.
config["model"]["vf_share_layers"] = False
config.model["vf_share_layers"] = False

num_iterations = 2

# Only supported for tf so far.
for _ in framework_iterator(config, frameworks=("tf2", "tf")):
trainer = ppo.APPOTrainer(config=config, env="CartPole-v0")
trainer = config.build(env="CartPole-v0")
for i in range(num_iterations):
results = trainer.train()
check_train_results(results)
@@ -90,23 +89,29 @@ def test_appo_two_tf_optimizers(self):
trainer.stop()

def test_appo_entropy_coeff_schedule(self):
config = ppo.appo.DEFAULT_CONFIG.copy()
config["num_workers"] = 1
config["num_gpus"] = 0
config["train_batch_size"] = 20
config["batch_mode"] = "truncate_episodes"
config["rollout_fragment_length"] = 10
config["min_sample_timesteps_per_reporting"] = 20
# Initial lr, doesn't really matter because of the schedule below.
config = (
ppo.appo.APPOConfig()
.rollouts(
num_rollout_workers=1,
batch_mode="truncate_episodes",
rollout_fragment_length=10,
)
.resources(num_gpus=0)
.training(
train_batch_size=20,
entropy_coeff=0.01,
entropy_coeff_schedule=[
[0, 0.01],
[120, 0.0001],
],
)
)

config.min_sample_timesteps_per_reporting = 20
# 0 metrics reporting delay, this makes sure timestep,
# which entropy coeff depends on, is updated after each worker rollout.
config["min_time_s_per_reporting"] = 0
# Initial lr, doesn't really matter because of the schedule below.
config["entropy_coeff"] = 0.01
schedule = [
[0, 0.01],
[120, 0.0001],
]
config["entropy_coeff_schedule"] = schedule
config.min_time_s_per_reporting = 0

def _step_n_times(trainer, n: int):
"""Step trainer n times.
@@ -121,7 +126,7 @@ def _step_n_times(trainer, n: int):
]

for _ in framework_iterator(config):
trainer = ppo.APPOTrainer(config=config, env="CartPole-v0")
trainer = config.build(env="CartPole-v0")

coeff = _step_n_times(trainer, 1) # 20 timesteps
# Should be close to the starting coeff of 0.01.
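The schedule used in this test anneals the entropy coefficient from 0.01 at timestep 0 down to 0.0001 at timestep 120. Assuming RLlib's usual piecewise-linear interpolation between schedule points, the values the test's assertions rely on can be reproduced with a short sketch (the helper below is illustrative, not an RLlib API):

def entropy_coeff_at(t, schedule=((0, 0.01), (120, 0.0001))):
    """Linearly interpolate between two schedule points, clamping outside."""
    (t0, v0), (t1, v1) = schedule
    if t <= t0:
        return v0
    if t >= t1:
        return v1
    return v0 + (t - t0) / (t1 - t0) * (v1 - v0)


print(entropy_coeff_at(20))   # ~0.00835 -> still close to the initial 0.01
print(entropy_coeff_at(120))  # 0.0001   -> fully annealed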