From d91ff3a4a70c96c34ee4aa6b9c23a07f20431c86 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Sun, 1 May 2022 21:15:39 +0200 Subject: [PATCH 1/5] wip --- rllib/agents/impala/__init__.py | 5 +- rllib/agents/impala/impala.py | 384 ++++++++++++++++------- rllib/agents/impala/tests/test_impala.py | 47 +-- 3 files changed, 308 insertions(+), 128 deletions(-) diff --git a/rllib/agents/impala/__init__.py b/rllib/agents/impala/__init__.py index ed24770f6f88..07c4f39abc9a 100644 --- a/rllib/agents/impala/__init__.py +++ b/rllib/agents/impala/__init__.py @@ -1,6 +1,7 @@ -from ray.rllib.agents.impala.impala import DEFAULT_CONFIG, ImpalaTrainer +from ray.rllib.agents.impala.impala import DEFAULT_CONFIG, ImpalaConfig, ImpalaTrainer __all__ = [ - "DEFAULT_CONFIG", + "ImpalaConfig", "ImpalaTrainer", + "DEFAULT_CONFIG", ] diff --git a/rllib/agents/impala/impala.py b/rllib/agents/impala/impala.py index c431d1ef4114..a1af09ad3275 100644 --- a/rllib/agents/impala/impala.py +++ b/rllib/agents/impala/impala.py @@ -1,9 +1,9 @@ import logging -from typing import Optional, Type +from typing import Callable, List, Optional, Type, Union import ray from ray.rllib.agents.impala.vtrace_tf_policy import VTraceTFPolicy -from ray.rllib.agents.trainer import Trainer, with_common_config +from ray.rllib.agents.trainer import Trainer, TrainerConfig from ray.rllib.execution.learner_thread import LearnerThread from ray.rllib.execution.multi_gpu_learner_thread import MultiGPULearnerThread from ray.rllib.execution.tree_agg import gather_experiences_tree_aggregation @@ -19,115 +19,255 @@ from ray.rllib.execution.metric_ops import StandardMetricsReporting from ray.rllib.policy.policy import Policy from ray.rllib.utils.annotations import override -from ray.rllib.utils.deprecation import DEPRECATED_VALUE, deprecation_warning +from ray.rllib.utils.deprecation import ( + Deprecated, + DEPRECATED_VALUE, + deprecation_warning, +) from ray.rllib.utils.typing import PartialTrainerConfigDict, TrainerConfigDict from ray.tune.utils.placement_groups import PlacementGroupFactory logger = logging.getLogger(__name__) -# fmt: off -# __sphinx_doc_begin__ -DEFAULT_CONFIG = with_common_config({ - # V-trace params (see vtrace_tf/torch.py). - "vtrace": True, - "vtrace_clip_rho_threshold": 1.0, - "vtrace_clip_pg_rho_threshold": 1.0, - # If True, drop the last timestep for the vtrace calculations, such that - # all data goes into the calculations as [B x T-1] (+ the bootstrap value). - # This is the default and legacy RLlib behavior, however, could potentially - # have a destabilizing effect on learning, especially in sparse reward - # or reward-at-goal environments. - # False for not dropping the last timestep. - "vtrace_drop_last_ts": True, - # System params. - # - # == Overview of data flow in IMPALA == - # 1. Policy evaluation in parallel across `num_workers` actors produces - # batches of size `rollout_fragment_length * num_envs_per_worker`. - # 2. If enabled, the replay buffer stores and produces batches of size - # `rollout_fragment_length * num_envs_per_worker`. - # 3. If enabled, the minibatch ring buffer stores and replays batches of - # size `train_batch_size` up to `num_sgd_iter` times per batch. - # 4. The learner thread executes data parallel SGD across `num_gpus` GPUs - # on batches of size `train_batch_size`. - # - "rollout_fragment_length": 50, - "train_batch_size": 500, - "min_time_s_per_reporting": 10, - "num_workers": 2, - # Number of GPUs the learner should use. 
- "num_gpus": 1, - # For each stack of multi-GPU towers, how many slots should we reserve for - # parallel data loading? Set this to >1 to load data into GPUs in - # parallel. This will increase GPU memory usage proportionally with the - # number of stacks. - # Example: - # 2 GPUs and `num_multi_gpu_tower_stacks=3`: - # - One tower stack consists of 2 GPUs, each with a copy of the - # model/graph. - # - Each of the stacks will create 3 slots for batch data on each of its - # GPUs, increasing memory requirements on each GPU by 3x. - # - This enables us to preload data into these stacks while another stack - # is performing gradient calculations. - "num_multi_gpu_tower_stacks": 1, - # How many train batches should be retained for minibatching. This conf - # only has an effect if `num_sgd_iter > 1`. - "minibatch_buffer_size": 1, - # Number of passes to make over each train batch. - "num_sgd_iter": 1, - # Set >0 to enable experience replay. Saved samples will be replayed with - # a p:1 proportion to new data samples. - "replay_proportion": 0.0, - # Number of sample batches to store for replay. The number of transitions - # saved total will be (replay_buffer_num_slots * rollout_fragment_length). - "replay_buffer_num_slots": 0, - # Max queue size for train batches feeding into the learner. - "learner_queue_size": 16, - # Wait for train batches to be available in minibatch buffer queue - # this many seconds. This may need to be increased e.g. when training - # with a slow environment. - "learner_queue_timeout": 300, - # Level of queuing for sampling. - "max_sample_requests_in_flight_per_worker": 2, - # Max number of workers to broadcast one set of weights to. - "broadcast_interval": 1, - # Use n (`num_aggregation_workers`) extra Actors for multi-level - # aggregation of the data produced by the m RolloutWorkers - # (`num_workers`). Note that n should be much smaller than m. - # This can make sense if ingesting >2GB/s of samples, or if - # the data requires decompression. - "num_aggregation_workers": 0, - - # Learning params. - "grad_clip": 40.0, - # Either "adam" or "rmsprop". - "opt_type": "adam", - "lr": 0.0005, - "lr_schedule": None, - # `opt_type=rmsprop` settings. - "decay": 0.99, - "momentum": 0.0, - "epsilon": 0.1, - # Balancing the three losses. - "vf_loss_coeff": 0.5, - "entropy_coeff": 0.01, - "entropy_coeff_schedule": None, - # Set this to true to have two separate optimizers optimize the policy- - # and value networks. - "_separate_vf_optimizer": False, - # If _separate_vf_optimizer is True, define separate learning rate - # for the value network. - "_lr_vf": 0.0005, - - # Callback for APPO to use to update KL, target network periodically. - # The input to the callback is the learner fetches dict. - "after_train_step": None, - - # DEPRECATED: - "num_data_loader_buffers": DEPRECATED_VALUE, -}) -# __sphinx_doc_end__ -# fmt: on + +class ImpalaConfig(TrainerConfig): + """Defines an ARSTrainer configuration class from which an ImpalaTrainer can be built. + + Example: + >>> config = ImpalaConfig().training(lr=0.0003, train_batch_size=512)\ + ... .resources(num_gpus=4)\ + ... .rollouts(num_rollout_workers=64) + >>> print(config.to_dict()) + >>> # Build a Trainer object from the config and run 1 training iteration. + >>> trainer = config.build(env="CartPole-v1") + >>> trainer.train() + + Example: + >>> from ray import tune + >>> config = ImpalaConfig() + >>> # Print out some default values. + >>> print(config.vtrace) + >>> # Update the config object. 
+ >>> config.training(lr=tune.grid_search([0.0001, 0.0003]), grad_clip=20.0) + >>> # Set the config object's env. + >>> config.environment(env="CartPole-v1") + >>> # Use to_dict() to get the old-style python config dict + >>> # when running with tune. + >>> tune.run( + ... "IMPALA", + ... stop={"episode_reward_mean": 200}, + ... config=config.to_dict(), + ... ) + """ + + def __init__(self): + """Initializes a ImpalaConfig instance.""" + super().__init__(trainer_class=ImpalaTrainer) + + # fmt: off + # __sphinx_doc_begin__ + + # IMPALA specific settings: + self.vtrace = True + self.vtrace_clip_rho_threshold = 1.0 + self.vtrace_clip_pg_rho_threshold = 1.0 + self.vtrace_drop_last_ts = True + self.num_multi_gpu_tower_stacks = 1 + self.minibatch_buffer_size = 1 + self.num_sgd_iter = 1 + self.replay_proportion = 0.0 + self.replay_buffer_num_slots = 0 + self.learner_queue_size = 16 + self.learner_queue_timeout = 300 + self.max_sample_requests_in_flight_per_worker = 2 + self.broadcast_interval = 1 + self.num_aggregation_workers = 0 + self.grad_clip = 40.0 + self.opt_type = "adam" + self.lr_schedule = None + self.decay = 0.99 + self.momentum = 0.0 + self.epsilon = 0.1 + self.vf_loss_coeff = 0.5 + self.entropy_coeff = 0.01 + self.entropy_coeff_schedule = None + self._separate_vf_optimizer = False + self._lr_vf = 0.0005 + self.after_train_step = None + + # Override some of TrainerConfig's default values with ARS-specific values. + self.rollout_fragment_length = 50 + self.train_batch_size = 500 + self.num_workers = 2 + self.num_gpus = 1 + self.lr = 0.0005 + self.min_time_s_per_reporting = 10 + # __sphinx_doc_end__ + # fmt: on + + # Deprecated value. + self.num_data_loader_buffers = DEPRECATED_VALUE + + @override(TrainerConfig) + def training( + self, + *, + vtrace: Optional[bool] = None, + vtrace_clip_rho_threshold: Optional[float] = None, + vtrace_clip_pg_rho_threshold: Optional[float] = None, + vtrace_drop_last_ts: Optional[bool] = None, + num_multi_gpu_tower_stacks: Optional[int] = None, + minibatch_buffer_size: Optional[int] = None, + num_sgd_iter: Optional[int] = None, + replay_proportion: Optional[float] = None, + replay_buffer_num_slots: Optional[int] = None, + learner_queue_size: Optional[int] = None, + learner_queue_timeout: Optional[float] = None, + max_sample_requests_in_flight_per_worker: Optional[int] = None, + broadcast_interval: Optional[int] = None, + num_aggregation_workers: Optional[int] = None, + grad_clip: Optional[float] = None, + opt_type: Optional[str] = None, + lr_schedule: Optional[List[List[Union[int, float]]]] = None, + decay: Optional[float] = None, + momentum: Optional[float] = None, + epsilon: Optional[float] = None, + vf_loss_coeff: Optional[float] = None, + entropy_coeff: Optional[float] = None, + entropy_coeff_schedule: Optional[List[List[Union[int, float]]]] = None, + _separate_vf_optimizer: Optional[bool] = None, + _lr_vf: Optional[float] = None, + after_train_step: Optional[Callable[[dict], None]] = None, + **kwargs, + ) -> "ImpalaConfig": + """Sets the training related configuration. + + Args: + vtrace: V-trace params (see vtrace_tf/torch.py). + vtrace_clip_rho_threshold: + vtrace_clip_pg_rho_threshold: + vtrace_drop_last_ts: If True, drop the last timestep for the vtrace + calculations, such that all data goes into the calculations as [B x T-1] + (+ the bootstrap value). This is the default and legacy RLlib behavior, + however, could potentially have a destabilizing effect on learning, + especially in sparse reward or reward-at-goal environments. 
+ False for not dropping the last timestep. + System params. + num_multi_gpu_tower_stacks: For each stack of multi-GPU towers, how many + slots should we reserve for parallel data loading? Set this to >1 to + load data into GPUs in parallel. This will increase GPU memory usage + proportionally with the number of stacks. + Example: + 2 GPUs and `num_multi_gpu_tower_stacks=3`: + - One tower stack consists of 2 GPUs, each with a copy of the + model/graph. + - Each of the stacks will create 3 slots for batch data on each of its + GPUs, increasing memory requirements on each GPU by 3x. + - This enables us to preload data into these stacks while another stack + is performing gradient calculations. + minibatch_buffer_size: How many train batches should be retained for + minibatching. This conf only has an effect if `num_sgd_iter > 1`. + num_sgd_iter: Number of passes to make over each train batch. + replay_proportion: Set >0 to enable experience replay. Saved samples will + be replayed with a p:1 proportion to new data samples. + replay_buffer_num_slots: Number of sample batches to store for replay. + The number of transitions saved total will be + (replay_buffer_num_slots * rollout_fragment_length). + learner_queue_size: Max queue size for train batches feeding into the + learner. + learner_queue_timeout: Wait for train batches to be available in minibatch + buffer queue this many seconds. This may need to be increased e.g. when + training with a slow environment. + max_sample_requests_in_flight_per_worker: Level of queuing for sampling. + broadcast_interval: Max number of workers to broadcast one set of + weights to. + + num_aggregation_workers: Use n (`num_aggregation_workers`) extra Actors for + multi-level aggregation of the data produced by the m RolloutWorkers + (`num_workers`). Note that n should be much smaller than m. + This can make sense if ingesting >2GB/s of samples, or if + the data requires decompression. + grad_clip: + opt_type: Either "adam" or "rmsprop". + lr_schedule: + + decay: `opt_type=rmsprop` settings. + momentum: + epsilon: + + vf_loss_coeff: Coefficient for the value function term in the loss function. + entropy_coeff: Coefficient for the entropy regularizer term in the loss + function. + entropy_coeff_schedule: + _separate_vf_optimizer: Set this to true to have two separate optimizers + optimize the policy-and value networks. + _lr_vf: If _separate_vf_optimizer is True, define separate learning rate + for the value network. + after_train_step: Callback for APPO to use to update KL, target network + periodically. The input to the callback is the learner fetches dict. + + Returns: + This updated TrainerConfig object. + """ + # Pass kwargs onto super's `training()` method. 
+ super().training(**kwargs) + + if vtrace is not None: + self.vtrace = vtrace + if vtrace_clip_rho_threshold is not None: + self.vtrace_clip_rho_threshold = vtrace_clip_rho_threshold + if vtrace_clip_pg_rho_threshold is not None: + self.vtrace_clip_pg_rho_threshold = vtrace_clip_pg_rho_threshold + if vtrace_drop_last_ts is not None: + self.vtrace_drop_last_ts = vtrace_drop_last_ts + if num_multi_gpu_tower_stacks is not None: + self.num_multi_gpu_tower_stacks = num_multi_gpu_tower_stacks + if minibatch_buffer_size is not None: + self.minibatch_buffer_size = minibatch_buffer_size + if num_sgd_iter is not None: + self.num_sgd_iter = num_sgd_iter + if replay_proportion is not None: + self.replay_proportion = replay_proportion + if replay_buffer_num_slots is not None: + self.replay_buffer_num_slots = replay_buffer_num_slots + if learner_queue_size is not None: + self.learner_queue_size = learner_queue_size + if learner_queue_timeout is not None: + self.learner_queue_timeout = learner_queue_timeout + if max_sample_requests_in_flight_per_worker is not None: + self.max_sample_requests_in_flight_per_worker = ( + max_sample_requests_in_flight_per_worker + ) + if broadcast_interval is not None: + self.broadcast_interval = broadcast_interval + if num_aggregation_workers is not None: + self.num_aggregation_workers = num_aggregation_workers + if grad_clip is not None: + self.grad_clip = grad_clip + if opt_type is not None: + self.opt_type = opt_type + if lr_schedule is not None: + self.lr_schedule = lr_schedule + if decay is not None: + self.decay = decay + if momentum is not None: + self.momentum = momentum + if epsilon is not None: + self.epsilon = epsilon + if vf_loss_coeff is not None: + self.vf_loss_coeff = vf_loss_coeff + if entropy_coeff is not None: + self.entropy_coeff = entropy_coeff + if entropy_coeff_schedule is not None: + self.entropy_coeff_schedule = entropy_coeff_schedule + if _separate_vf_optimizer is not None: + self._separate_vf_optimizer = _separate_vf_optimizer + if _lr_vf is not None: + self._lr_vf = _lr_vf + if after_train_step is not None: + self.after_train_step = after_train_step + + return self def make_learner_thread(local_worker, config): @@ -227,10 +367,23 @@ def __call__(self, item): class ImpalaTrainer(Trainer): + """Importance weighted actor/learner architecture (IMPALA) Trainer + + == Overview of data flow in IMPALA == + 1. Policy evaluation in parallel across `num_workers` actors produces + batches of size `rollout_fragment_length * num_envs_per_worker`. + 2. If enabled, the replay buffer stores and produces batches of size + `rollout_fragment_length * num_envs_per_worker`. + 3. If enabled, the minibatch ring buffer stores and replays batches of + size `train_batch_size` up to `num_sgd_iter` times per batch. + 4. The learner thread executes data parallel SGD across `num_gpus` GPUs + on batches of size `train_batch_size`. + """ + @classmethod @override(Trainer) def get_default_config(cls) -> TrainerConfigDict: - return DEFAULT_CONFIG + return ImpalaConfig().to_dict() @override(Trainer) def get_default_policy_class( @@ -412,3 +565,20 @@ def default_resource_request(cls, config): ), strategy=config.get("placement_strategy", "PACK"), ) + + +# Deprecated: Use ray.rllib.agents.pg.PGConfig instead! 
+class _deprecated_default_config(dict): + def __init__(self): + super().__init__(ImpalaConfig().to_dict()) + + @Deprecated( + old="ray.rllib.agents.impala.default_config::DEFAULT_CONFIG", + new="ray.rllib.agents.impala.impala.IMPALAConfig(...)", + error=False, + ) + def __getitem__(self, item): + return super().__getitem__(item) + + +DEFAULT_CONFIG = _deprecated_default_config() diff --git a/rllib/agents/impala/tests/test_impala.py b/rllib/agents/impala/tests/test_impala.py index 4b9790fa7cb5..6f3c18b2964a 100644 --- a/rllib/agents/impala/tests/test_impala.py +++ b/rllib/agents/impala/tests/test_impala.py @@ -26,26 +26,32 @@ def tearDownClass(cls) -> None: def test_impala_compilation(self): """Test whether an ImpalaTrainer can be built with both frameworks.""" - config = impala.DEFAULT_CONFIG.copy() - config["num_gpus"] = 0 - config["model"]["lstm_use_prev_action"] = True - config["model"]["lstm_use_prev_reward"] = True + config = ( + impala.ImpalaConfig() + .resources(num_gpus=0) + .training( + model={ + "lstm_use_prev_action": True, + "lstm_use_prev_reward": True, + } + ) + ) + num_iterations = 1 env = "CartPole-v0" for _ in framework_iterator(config, with_eager_tracing=True): - local_cfg = config.copy() for lstm in [False, True]: - local_cfg["num_aggregation_workers"] = 0 if not lstm else 1 - local_cfg["model"]["use_lstm"] = lstm + config.num_aggregation_workers = 0 if not lstm else 1 + config.model["use_lstm"] = lstm print( "lstm={} aggregation-workers={}".format( - lstm, local_cfg["num_aggregation_workers"] + lstm, config.num_aggregation_workers ) ) # Test with and w/o aggregation workers (this has nothing # to do with LSTMs, though). - trainer = impala.ImpalaTrainer(config=local_cfg, env=env) + trainer = config.build(env=env) for i in range(num_iterations): results = trainer.train() check_train_results(results) @@ -59,17 +65,20 @@ def test_impala_compilation(self): trainer.stop() def test_impala_lr_schedule(self): - config = impala.DEFAULT_CONFIG.copy() - config["num_gpus"] = 0 # Test whether we correctly ignore the "lr" setting. # The first lr should be 0.05. - config["lr"] = 0.1 - config["lr_schedule"] = [ - [0, 0.05], - [10000, 0.000001], - ] - config["num_gpus"] = 0 # Do not use any (fake) GPUs. 
- config["env"] = "CartPole-v0" + config = ( + impala.ImpalaConfig() + .resources(num_gpus=0) + .training( + lr=0.1, + lr_schedule=[ + [0, 0.05], + [10000, 0.000001], + ], + ) + ) + config.environment(env="CartPole-v0") def get_lr(result): return result["info"][LEARNER_INFO][DEFAULT_POLICY_ID][LEARNER_STATS_KEY][ @@ -77,7 +86,7 @@ def get_lr(result): ] for fw in framework_iterator(config): - trainer = impala.ImpalaTrainer(config=config) + trainer = config.build() policy = trainer.get_policy() try: From da26714a5d408416a80c8bae37ad1ae11eecf203 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Sun, 1 May 2022 22:14:52 +0200 Subject: [PATCH 2/5] wip --- rllib/__init__.py | 2 +- rllib/agents/impala/impala.py | 4 +- rllib/agents/ppo/__init__.py | 3 +- rllib/agents/ppo/appo.py | 210 ++++++++++++++++++++-------- rllib/agents/ppo/tests/test_appo.py | 60 ++++---- 5 files changed, 178 insertions(+), 101 deletions(-) diff --git a/rllib/__init__.py b/rllib/__init__.py index a8867c09bf1d..1ac2ece81b4e 100644 --- a/rllib/__init__.py +++ b/rllib/__init__.py @@ -57,7 +57,7 @@ def setup(self, config): _setup_logger() -usage_lib.record_library_usage("rllib") +# usage_lib.record_library_usage("rllib") __all__ = [ "Policy", diff --git a/rllib/agents/impala/impala.py b/rllib/agents/impala/impala.py index a1af09ad3275..6e500714b021 100644 --- a/rllib/agents/impala/impala.py +++ b/rllib/agents/impala/impala.py @@ -60,9 +60,9 @@ class ImpalaConfig(TrainerConfig): ... ) """ - def __init__(self): + def __init__(self, trainer_class=None): """Initializes a ImpalaConfig instance.""" - super().__init__(trainer_class=ImpalaTrainer) + super().__init__(trainer_class=trainer_class or ImpalaTrainer) # fmt: off # __sphinx_doc_begin__ diff --git a/rllib/agents/ppo/__init__.py b/rllib/agents/ppo/__init__.py index dca9f385fde0..3a8c0a20e486 100644 --- a/rllib/agents/ppo/__init__.py +++ b/rllib/agents/ppo/__init__.py @@ -1,10 +1,11 @@ from ray.rllib.agents.ppo.ppo import PPOConfig, PPOTrainer, DEFAULT_CONFIG from ray.rllib.agents.ppo.ppo_tf_policy import PPOTFPolicy from ray.rllib.agents.ppo.ppo_torch_policy import PPOTorchPolicy -from ray.rllib.agents.ppo.appo import APPOTrainer +from ray.rllib.agents.ppo.appo import APPOConfig, APPOTrainer from ray.rllib.agents.ppo.ddppo import DDPPOTrainer __all__ = [ + "APPOConfig", "APPOTrainer", "DDPPOTrainer", "DEFAULT_CONFIG", diff --git a/rllib/agents/ppo/appo.py b/rllib/agents/ppo/appo.py index 2ef0f9a88e26..c901c6d40b6d 100644 --- a/rllib/agents/ppo/appo.py +++ b/rllib/agents/ppo/appo.py @@ -23,69 +23,140 @@ _get_shared_metrics, ) from ray.rllib.utils.annotations import override +from ray.rllib.utils.deprecation import Deprecated from ray.rllib.utils.typing import PartialTrainerConfigDict, TrainerConfigDict -# fmt: off -# __sphinx_doc_begin__ - -# Adds the following updates to the `IMPALATrainer` config in -# rllib/agents/impala/impala.py. -DEFAULT_CONFIG = impala.ImpalaTrainer.merge_trainer_configs( - impala.DEFAULT_CONFIG, # See keys in impala.py, which are also supported. - { - # Whether to use V-trace weighted advantages. If false, PPO GAE - # advantages will be used instead. - "vtrace": True, - - # == These two options only apply if vtrace: False == - # Should use a critic as a baseline (otherwise don't use value - # baseline; required for using GAE). - "use_critic": True, - # If true, use the Generalized Advantage Estimator (GAE) - # with a value function, see https://arxiv.org/pdf/1506.02438.pdf. 
- "use_gae": True, - # GAE(lambda) parameter - "lambda": 1.0, - - # == PPO surrogate loss options == - "clip_param": 0.4, - - # == PPO KL Loss options == - "use_kl_loss": False, - "kl_coeff": 1.0, - "kl_target": 0.01, - - # == IMPALA optimizer params (see documentation in impala.py) == - "rollout_fragment_length": 50, - "train_batch_size": 500, - "min_time_s_per_reporting": 10, - "num_workers": 2, - "num_gpus": 0, - "num_multi_gpu_tower_stacks": 1, - "minibatch_buffer_size": 1, - "num_sgd_iter": 1, - "replay_proportion": 0.0, - "replay_buffer_num_slots": 100, - "learner_queue_size": 16, - "learner_queue_timeout": 300, - "max_sample_requests_in_flight_per_worker": 2, - "broadcast_interval": 1, - "grad_clip": 40.0, - "opt_type": "adam", - "lr": 0.0005, - "lr_schedule": None, - "decay": 0.99, - "momentum": 0.0, - "epsilon": 0.1, - "vf_loss_coeff": 0.5, - "entropy_coeff": 0.01, - "entropy_coeff_schedule": None, - }, - _allow_unknown_configs=True, -) -# __sphinx_doc_end__ -# fmt: on +class APPOConfig(impala.ImpalaConfig): + """Defines a A2CTrainer configuration class from which a new Trainer can be built. + + Example: + >>> from ray import tune + >>> config = APPOConfig().training(lr=0.01, grad_clip=30.0)\ + ... .resources(num_gpus=1)\ + ... .rollouts(num_rollout_workers=16) + >>> print(config.to_dict()) + >>> # Build a Trainer object from the config and run 1 training iteration. + >>> trainer = config.build(env="CartPole-v1") + >>> trainer.train() + + Example: + >>> config = APPOConfig() + >>> # Print out some default values. + >>> print(config.sample_async) + >>> # Update the config object. + >>> config.training(lr=tune.grid_search([0.001, 0.0001])) + >>> # Set the config object's env. + >>> config.environment(env="CartPole-v1") + >>> # Use to_dict() to get the old-style python config dict + >>> # when running with tune. + >>> tune.run( + ... "APPO", + ... stop={"episode_reward_mean": 200}, + ... config=config.to_dict(), + ... ) + """ + + def __init__(self, trainer_class=None): + """Initializes a APPOConfig instance.""" + super().__init__(trainer_class=trainer_class or APPOTrainer) + + # fmt: off + # __sphinx_doc_begin__ + + # APPO specific settings: + self.vtrace = True + self.use_critic = True + self.use_gae = True + self.lambda_ = 1.0 + self.clip_param = 0.4 + self.use_kl_loss = False + self.kl_coeff = 1.0 + self.kl_target = 0.01 + + # Override some of ImpalaConfig's default values with APPO-specific values. 
+ self.rollout_fragment_length = 50 + self.train_batch_size = 500 + self.min_time_s_per_reporting = 10 + self.num_workers = 2 + self.num_gpus = 0 + self.num_multi_gpu_tower_stacks = 1 + self.minibatch_buffer_size = 1 + self.num_sgd_iter = 1 + self.replay_proportion = 0.0 + self.replay_buffer_num_slots = 100 + self.learner_queue_size = 16 + self.learner_queue_timeout = 300 + self.max_sample_requests_in_flight_per_worker = 2 + self.broadcast_interval = 1 + self.grad_clip = 40.0 + self.opt_type = "adam" + self.lr = 0.0005 + self.lr_schedule = None + self.decay = 0.99 + self.momentum = 0.0 + self.epsilon = 0.1 + self.vf_loss_coeff = 0.5 + self.entropy_coeff = 0.01 + self.entropy_coeff_schedule = None + # __sphinx_doc_end__ + # fmt: on + + @override(impala.ImpalaConfig) + def training( + self, + *, + vtrace: Optional[bool] = None, + use_critic: Optional[bool] = None, + use_gae: Optional[bool] = None, + lambda_: Optional[float] = None, + clip_param: Optional[float] = None, + use_kl_loss: Optional[bool] = None, + kl_coeff: Optional[float] = None, + kl_target: Optional[float] = None, + **kwargs, + ) -> "APPOConfig": + """Sets the training related configuration. + + Args: + vtrace: Whether to use V-trace weighted advantages. If false, PPO GAE + advantages will be used instead. + use_critic: Should use a critic as a baseline (otherwise don't use value + baseline; required for using GAE). Only applies if vtrace=False. + use_gae: If true, use the Generalized Advantage Estimator (GAE) + with a value function, see https://arxiv.org/pdf/1506.02438.pdf. + Only applies if vtrace=False. + lambda_: GAE (lambda) parameter. + clip_param: PPO surrogate slipping parameter. + use_kl_loss: Whether to use the KL-term in the loss function. + kl_coeff: Coefficient for weighting the KL-loss term. + kl_target: Target term for the KL-term to reach (via adjusting the + `kl_coeff` automatically). + + Returns: + This updated TrainerConfig object. + """ + # Pass kwargs onto super's `training()` method. + super().training(**kwargs) + + if vtrace is not None: + self.vtrace = vtrace + if use_critic is not None: + self.use_critic = use_critic + if use_gae is not None: + self.use_gae = use_gae + if lambda_ is not None: + self.lambda_ = lambda_ + if clip_param is not None: + self.clip_param = clip_param + if use_kl_loss is not None: + self.use_kl_loss = use_kl_loss + if kl_coeff is not None: + self.kl_coeff = kl_coeff + if kl_target is not None: + self.kl_target = kl_target + + return self class UpdateTargetAndKL: @@ -130,7 +201,7 @@ def __init__(self, config, *args, **kwargs): @classmethod @override(Trainer) def get_default_config(cls) -> TrainerConfigDict: - return DEFAULT_CONFIG + return APPOConfig().to_dict() @override(Trainer) def get_default_policy_class( @@ -142,3 +213,20 @@ def get_default_policy_class( return AsyncPPOTorchPolicy else: return AsyncPPOTFPolicy + + +# Deprecated: Use ray.rllib.agents.a3c.A3CConfig instead! 
+class _deprecated_default_config(dict): + def __init__(self): + super().__init__(APPOConfig().to_dict()) + + @Deprecated( + old="ray.rllib.agents.ppo.appo.DEFAULT_CONFIG", + new="ray.rllib.agents.ppo.appo.APPOConfig(...)", + error=False, + ) + def __getitem__(self, item): + return super().__getitem__(item) + + +DEFAULT_CONFIG = _deprecated_default_config() diff --git a/rllib/agents/ppo/tests/test_appo.py b/rllib/agents/ppo/tests/test_appo.py index 551a45b5788d..f96970a1e16d 100644 --- a/rllib/agents/ppo/tests/test_appo.py +++ b/rllib/agents/ppo/tests/test_appo.py @@ -22,15 +22,13 @@ def tearDownClass(cls): def test_appo_compilation(self): """Test whether an APPOTrainer can be built with both frameworks.""" - config = ppo.appo.DEFAULT_CONFIG.copy() - config["num_workers"] = 1 + config = ppo.appo.APPOConfig().rollouts(num_rollout_workers=1) num_iterations = 2 for _ in framework_iterator(config, with_eager_tracing=True): print("w/o v-trace") - _config = config.copy() - _config["vtrace"] = False - trainer = ppo.APPOTrainer(config=_config, env="CartPole-v0") + config.vtrace = False + trainer = config.build(env="CartPole-v0") for i in range(num_iterations): results = trainer.train() check_train_results(results) @@ -39,9 +37,8 @@ def test_appo_compilation(self): trainer.stop() print("w/ v-trace") - _config = config.copy() - _config["vtrace"] = True - trainer = ppo.APPOTrainer(config=_config, env="CartPole-v0") + config.vtrace = True + trainer = config.build(env="CartPole-v0") for i in range(num_iterations): results = trainer.train() check_train_results(results) @@ -51,13 +48,11 @@ def test_appo_compilation(self): def test_appo_compilation_use_kl_loss(self): """Test whether an APPOTrainer can be built with kl_loss enabled.""" - config = ppo.appo.DEFAULT_CONFIG.copy() - config["num_workers"] = 1 - config["use_kl_loss"] = True + config = ppo.appo.APPOConfig().rollouts(num_rollout_workers=1).training(use_kl_loss=True) num_iterations = 2 for _ in framework_iterator(config, with_eager_tracing=True): - trainer = ppo.APPOTrainer(config=config, env="CartPole-v0") + trainer = config.build(env="CartPole-v0") for i in range(num_iterations): results = trainer.train() check_train_results(results) @@ -66,22 +61,19 @@ def test_appo_compilation_use_kl_loss(self): trainer.stop() def test_appo_two_tf_optimizers(self): - config = ppo.appo.DEFAULT_CONFIG.copy() - config["num_workers"] = 1 - # Not explicitly setting this should cause a warning, but not fail. # config["_tf_policy_handles_more_than_one_loss"] = True - config["_separate_vf_optimizer"] = True - config["_lr_vf"] = 0.0002 - + config = ppo.appo.APPOConfig().rollouts(num_rollout_workers=1).training( + _separate_vf_optimizer=True, _lr_vf=0.002) # Make sure we have two completely separate models for policy and # value function. - config["model"]["vf_share_layers"] = False + config.model["vf_share_layers"] = False + num_iterations = 2 # Only supported for tf so far. 
for _ in framework_iterator(config, frameworks=("tf2", "tf")): - trainer = ppo.APPOTrainer(config=config, env="CartPole-v0") + trainer = config.build(env="CartPole-v0") for i in range(num_iterations): results = trainer.train() check_train_results(results) @@ -90,23 +82,19 @@ def test_appo_two_tf_optimizers(self): trainer.stop() def test_appo_entropy_coeff_schedule(self): - config = ppo.appo.DEFAULT_CONFIG.copy() - config["num_workers"] = 1 - config["num_gpus"] = 0 - config["train_batch_size"] = 20 - config["batch_mode"] = "truncate_episodes" - config["rollout_fragment_length"] = 10 - config["timesteps_per_iteration"] = 20 + # Initial lr, doesn't really matter because of the schedule below. + config = ppo.appo.APPOConfig().\ + rollouts(num_rollout_workers=1, batch_mode="truncate_episodes", rollout_fragment_length=10).\ + resources(num_gpus=0).\ + training(train_batch_size=20, entropy_coeff=0.01, entropy_coeff_schedule=[ + [0, 0.01], + [120, 0.0001], + ]) + + config.min_sample_timesteps_per_reporting = 20 # 0 metrics reporting delay, this makes sure timestep, # which entropy coeff depends on, is updated after each worker rollout. - config["min_time_s_per_reporting"] = 0 - # Initial lr, doesn't really matter because of the schedule below. - config["entropy_coeff"] = 0.01 - schedule = [ - [0, 0.01], - [120, 0.0001], - ] - config["entropy_coeff_schedule"] = schedule + config.min_time_s_per_reporting = 0 def _step_n_times(trainer, n: int): """Step trainer n times. @@ -121,7 +109,7 @@ def _step_n_times(trainer, n: int): ] for _ in framework_iterator(config): - trainer = ppo.APPOTrainer(config=config, env="CartPole-v0") + trainer = config.build(env="CartPole-v0") coeff = _step_n_times(trainer, 1) # 20 timesteps # Should be close to the starting coeff of 0.01. From 21bf876640cc9d2c52e740cb959bce12444269a0 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Mon, 2 May 2022 10:25:23 +0200 Subject: [PATCH 3/5] wip --- rllib/agents/impala/impala.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/rllib/agents/impala/impala.py b/rllib/agents/impala/impala.py index a1af09ad3275..5c0072ac29d1 100644 --- a/rllib/agents/impala/impala.py +++ b/rllib/agents/impala/impala.py @@ -34,6 +34,7 @@ class ImpalaConfig(TrainerConfig): """Defines an ARSTrainer configuration class from which an ImpalaTrainer can be built. Example: + >>> from ray.rllib.agents.impala import ImpalaConfig >>> config = ImpalaConfig().training(lr=0.0003, train_batch_size=512)\ ... .resources(num_gpus=4)\ ... .rollouts(num_rollout_workers=64) @@ -43,6 +44,7 @@ class ImpalaConfig(TrainerConfig): >>> trainer.train() Example: + >>> from ray.rllib.agents.impala import ImpalaConfig >>> from ray import tune >>> config = ImpalaConfig() >>> # Print out some default values. From c629956fbfaba88319dd9ca31ee9f27cd72cd916 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Mon, 2 May 2022 10:45:21 +0200 Subject: [PATCH 4/5] wip --- rllib/agents/ppo/appo.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/rllib/agents/ppo/appo.py b/rllib/agents/ppo/appo.py index c901c6d40b6d..eae0f065d2c4 100644 --- a/rllib/agents/ppo/appo.py +++ b/rllib/agents/ppo/appo.py @@ -28,10 +28,10 @@ class APPOConfig(impala.ImpalaConfig): - """Defines a A2CTrainer configuration class from which a new Trainer can be built. + """Defines a APPOTrainer configuration class from which a new Trainer can be built. Example: - >>> from ray import tune + >>> from ray.rllib.agents.ppo import APPOConfig >>> config = APPOConfig().training(lr=0.01, grad_clip=30.0)\ ... 
.resources(num_gpus=1)\ ... .rollouts(num_rollout_workers=16) @@ -41,6 +41,8 @@ class APPOConfig(impala.ImpalaConfig): >>> trainer.train() Example: + >>> from ray.rllib.agents.ppo import APPOConfig + >>> from ray import tune >>> config = APPOConfig() >>> # Print out some default values. >>> print(config.sample_async) @@ -215,7 +217,7 @@ def get_default_policy_class( return AsyncPPOTFPolicy -# Deprecated: Use ray.rllib.agents.a3c.A3CConfig instead! +# Deprecated: Use ray.rllib.agents.ppo.APPOConfig instead! class _deprecated_default_config(dict): def __init__(self): super().__init__(APPOConfig().to_dict()) From 7b17862d919b3868268b03beebf2427550b8090f Mon Sep 17 00:00:00 2001 From: sven1977 Date: Mon, 2 May 2022 12:11:25 +0200 Subject: [PATCH 5/5] wip --- rllib/__init__.py | 2 +- rllib/agents/ppo/tests/test_appo.py | 37 +++++++++++++++++++++-------- 2 files changed, 28 insertions(+), 11 deletions(-) diff --git a/rllib/__init__.py b/rllib/__init__.py index 1ac2ece81b4e..a8867c09bf1d 100644 --- a/rllib/__init__.py +++ b/rllib/__init__.py @@ -57,7 +57,7 @@ def setup(self, config): _setup_logger() -# usage_lib.record_library_usage("rllib") +usage_lib.record_library_usage("rllib") __all__ = [ "Policy", diff --git a/rllib/agents/ppo/tests/test_appo.py b/rllib/agents/ppo/tests/test_appo.py index f96970a1e16d..1ddabc9f13ab 100644 --- a/rllib/agents/ppo/tests/test_appo.py +++ b/rllib/agents/ppo/tests/test_appo.py @@ -48,7 +48,11 @@ def test_appo_compilation(self): def test_appo_compilation_use_kl_loss(self): """Test whether an APPOTrainer can be built with kl_loss enabled.""" - config = ppo.appo.APPOConfig().rollouts(num_rollout_workers=1).training(use_kl_loss=True) + config = ( + ppo.appo.APPOConfig() + .rollouts(num_rollout_workers=1) + .training(use_kl_loss=True) + ) num_iterations = 2 for _ in framework_iterator(config, with_eager_tracing=True): @@ -63,8 +67,11 @@ def test_appo_compilation_use_kl_loss(self): def test_appo_two_tf_optimizers(self): # Not explicitly setting this should cause a warning, but not fail. # config["_tf_policy_handles_more_than_one_loss"] = True - config = ppo.appo.APPOConfig().rollouts(num_rollout_workers=1).training( - _separate_vf_optimizer=True, _lr_vf=0.002) + config = ( + ppo.appo.APPOConfig() + .rollouts(num_rollout_workers=1) + .training(_separate_vf_optimizer=True, _lr_vf=0.002) + ) # Make sure we have two completely separate models for policy and # value function. config.model["vf_share_layers"] = False @@ -83,13 +90,23 @@ def test_appo_two_tf_optimizers(self): def test_appo_entropy_coeff_schedule(self): # Initial lr, doesn't really matter because of the schedule below. - config = ppo.appo.APPOConfig().\ - rollouts(num_rollout_workers=1, batch_mode="truncate_episodes", rollout_fragment_length=10).\ - resources(num_gpus=0).\ - training(train_batch_size=20, entropy_coeff=0.01, entropy_coeff_schedule=[ - [0, 0.01], - [120, 0.0001], - ]) + config = ( + ppo.appo.APPOConfig() + .rollouts( + num_rollout_workers=1, + batch_mode="truncate_episodes", + rollout_fragment_length=10, + ) + .resources(num_gpus=0) + .training( + train_batch_size=20, + entropy_coeff=0.01, + entropy_coeff_schedule=[ + [0, 0.01], + [120, 0.0001], + ], + ) + ) config.min_sample_timesteps_per_reporting = 20 # 0 metrics reporting delay, this makes sure timestep,
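
A minimal end-to-end sketch of the builder-style config API these commits introduce, assembled from the docstring and test examples in the diff above. The module paths, the method names (training, resources, rollouts, environment, build, to_dict), and the environment names are taken from the patch itself; the concrete hyperparameter values are illustrative only, and the snippet assumes the Ray version these commits target.

from ray import tune
from ray.rllib.agents.impala import ImpalaConfig
from ray.rllib.agents.ppo import APPOConfig

# New style: chain the setter methods, then build a Trainer directly ...
config = (
    ImpalaConfig()
    .training(lr=0.0003, train_batch_size=512)
    .resources(num_gpus=0)
    .rollouts(num_rollout_workers=2)
)
trainer = config.build(env="CartPole-v1")
trainer.train()
trainer.stop()

# ... or convert to the old-style dict when running through tune.
# APPOConfig inherits all IMPALA keys and works the same way.
appo_config = APPOConfig().rollouts(num_rollout_workers=1).training(use_kl_loss=True)
appo_config.environment(env="CartPole-v1")
tune.run(
    "APPO",
    stop={"episode_reward_mean": 200},
    config=appo_config.to_dict(),
)

# The old DEFAULT_CONFIG dicts still exist, but key access now goes through the
# _deprecated_default_config shim added above and emits a deprecation warning
# (error=False), pointing users at ImpalaConfig / APPOConfig instead.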