diff --git a/rllib/algorithms/ddpg/tests/test_ddpg.py b/rllib/algorithms/ddpg/tests/test_ddpg.py
index 4045dcf78fcf..c15caf296c91 100644
--- a/rllib/algorithms/ddpg/tests/test_ddpg.py
+++ b/rllib/algorithms/ddpg/tests/test_ddpg.py
@@ -1,7 +1,6 @@
 import numpy as np
 import re
 import unittest
-from tempfile import TemporaryDirectory
 
 import ray
 import ray.rllib.algorithms.ddpg as ddpg
@@ -64,23 +63,6 @@ def test_ddpg_compilation(self):
             check(a, 500)
             trainer.stop()
 
-    def test_ddpg_checkpoint_save_and_restore(self):
-        """Test whether a DDPGTrainer can save and load checkpoints."""
-        config = ddpg.DEFAULT_CONFIG.copy()
-        config["num_workers"] = 1
-        config["num_envs_per_worker"] = 2
-        config["replay_buffer_config"]["learning_starts"] = 0
-        config["exploration_config"]["random_timesteps"] = 100
-
-        # Test against all frameworks.
-        for _ in framework_iterator(config, with_eager_tracing=True):
-            trainer = ddpg.DDPGTrainer(config=config, env="Pendulum-v1")
-            trainer.train()
-            with TemporaryDirectory() as temp_dir:
-                checkpoint = trainer.save(temp_dir)
-                trainer.restore(checkpoint)
-            trainer.stop()
-
     def test_ddpg_exploration_and_with_random_prerun(self):
         """Tests DDPG's Exploration (w/ random actions for n timesteps)."""
 
diff --git a/rllib/algorithms/maddpg/__init__.py b/rllib/algorithms/maddpg/__init__.py
index 2ae788f1ebd6..4de518de130a 100644
--- a/rllib/algorithms/maddpg/__init__.py
+++ b/rllib/algorithms/maddpg/__init__.py
@@ -1,3 +1,7 @@
-from ray.rllib.algorithms.maddpg.maddpg import MADDPGTrainer, DEFAULT_CONFIG
+from ray.rllib.algorithms.maddpg.maddpg import (
+    MADDPGConfig,
+    MADDPGTrainer,
+    DEFAULT_CONFIG,
+)
 
-__all__ = ["MADDPGTrainer", "DEFAULT_CONFIG"]
+__all__ = ["MADDPGConfig", "MADDPGTrainer", "DEFAULT_CONFIG"]
diff --git a/rllib/algorithms/maddpg/maddpg.py b/rllib/algorithms/maddpg/maddpg.py
index e63321586169..53fb81eccf9c 100644
--- a/rllib/algorithms/maddpg/maddpg.py
+++ b/rllib/algorithms/maddpg/maddpg.py
@@ -10,115 +10,241 @@
 """
 
 import logging
-from typing import Type
+from typing import List, Optional, Type
 
+from ray.rllib.agents.trainer_config import TrainerConfig
 from ray.rllib.algorithms.dqn.dqn import DQNTrainer
 from ray.rllib.algorithms.maddpg.maddpg_tf_policy import MADDPGTFPolicy
-from ray.rllib.agents.trainer import with_common_config
 from ray.rllib.policy.policy import Policy
 from ray.rllib.policy.sample_batch import SampleBatch, MultiAgentBatch
-from ray.rllib.utils.annotations import override
+from ray.rllib.utils.annotations import Deprecated, override
 from ray.rllib.utils.typing import TrainerConfigDict
 from ray.rllib.utils.deprecation import DEPRECATED_VALUE
 
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)
 
-# fmt: off
-# __sphinx_doc_begin__
-DEFAULT_CONFIG = with_common_config({
-    # === Framework to run the algorithm ===
-    "framework": "tf",
-
-    # === Settings for each individual policy ===
-    # ID of the agent controlled by this policy
-    "agent_id": None,
-    # Use a local critic for this policy.
-    "use_local_critic": False,
-
-    # === Evaluation ===
-    # Evaluation interval
-    "evaluation_interval": None,
-    # Number of episodes to run per evaluation period.
-    "evaluation_duration": 10,
-
-    # === Model ===
-    # Apply a state preprocessor with spec given by the "model" config option
-    # (like other RL algorithms). This is mostly useful if you have a weird
-    # observation shape, like an image. Disabled by default.
-    "use_state_preprocessor": False,
-    # Postprocess the policy network model output with these hidden layers. If
-    # use_state_preprocessor is False, then these will be the *only* hidden
-    # layers in the network.
-    "actor_hiddens": [64, 64],
-    # Hidden layers activation of the postprocessing stage of the policy
-    # network
-    "actor_hidden_activation": "relu",
-    # Postprocess the critic network model output with these hidden layers;
-    # again, if use_state_preprocessor is True, then the state will be
-    # preprocessed by the model specified with the "model" config option first.
-    "critic_hiddens": [64, 64],
-    # Hidden layers activation of the postprocessing state of the critic.
-    "critic_hidden_activation": "relu",
-    # N-step Q learning
-    "n_step": 1,
-    # Algorithm for good policies.
-    "good_policy": "maddpg",
-    # Algorithm for adversary policies.
-    "adv_policy": "maddpg",
-
-    # === Replay buffer ===
-    "replay_buffer_config": {
-        "type": "MultiAgentReplayBuffer",
-        # Specify prioritized replay by supplying a buffer type that supports
-        # prioritization, for example: MultiAgentPrioritizedReplayBuffer.
-        "prioritized_replay": DEPRECATED_VALUE,
-        "capacity": int(1e6),
-        # How many steps of the model to sample before learning starts.
-        "learning_starts": 1024 * 25,
-        # Force lockstep replay mode for MADDPG.
-        "replay_mode": "lockstep",
-    },
-    # Observation compression. Note that compression makes simulation slow in
-    # MPE.
-    "compress_observations": False,
-    # If set, this will fix the ratio of replayed from a buffer and learned on
-    # timesteps to sampled from an environment and stored in the replay buffer
-    # timesteps. Otherwise, the replay will proceed at the native ratio
-    # determined by (train_batch_size / rollout_fragment_length).
-    "training_intensity": None,
-
-    # === Optimization ===
-    # Learning rate for the critic (Q-function) optimizer.
-    "critic_lr": 1e-2,
-    # Learning rate for the actor (policy) optimizer.
-    "actor_lr": 1e-2,
-    # Update the target network every `target_network_update_freq` sample steps.
-    "target_network_update_freq": 0,
-    # Update the target by \tau * policy + (1-\tau) * target_policy
-    "tau": 0.01,
-    # Weights for feature regularization for the actor
-    "actor_feature_reg": 0.001,
-    # If not None, clip gradients during optimization at this value
-    "grad_norm_clipping": 0.5,
-    # Update the replay buffer with this many samples at once. Note that this
-    # setting applies per-worker if num_workers > 1.
-    "rollout_fragment_length": 100,
-    # Size of a batched sampled from replay buffer for training. Note that
-    # if async_updates is set, then each worker returns gradients for a
-    # batch of this size.
-    "train_batch_size": 1024,
-
-    # === Parallelism ===
-    # Number of workers for collecting samples with. This only makes sense
-    # to increase if your environment is particularly slow to sample, or if
-    # you're using the Async or Ape-X optimizers.
-    "num_workers": 1,
-    # Prevent iterations from going lower than this time span
-    "min_time_s_per_reporting": 0,
-})
-# __sphinx_doc_end__
-# fmt: on
+
+class MADDPGConfig(TrainerConfig):
+    """Defines a configuration class from which a MADDPGTrainer can be built.
+
+    Example:
+        >>> from ray.rllib.algorithms.maddpg.maddpg import MADDPGConfig
+        >>> config = MADDPGConfig()
+        >>> print(config.replay_buffer_config)
+        >>> config.replay_buffer_config.update(
+        >>>     {
+        >>>         "capacity": 100000,
+        >>>         "prioritized_replay_alpha": 0.8,
+        >>>         "prioritized_replay_beta": 0.45,
+        >>>         "prioritized_replay_eps": 2e-6,
+        >>>     }
+        >>> )
+        >>> config.training(replay_buffer_config=config.replay_buffer_config)\
+        >>>       .resources(num_gpus=0)\
+        >>>       .rollouts(num_rollout_workers=4)\
+        >>>       .environment("CartPole-v1")
+        >>> trainer = config.build()
+        >>> while True:
+        >>>     trainer.train()
+
+    Example:
+        >>> from ray.rllib.algorithms.maddpg.maddpg import MADDPGConfig
+        >>> from ray import tune
+        >>> config = MADDPGConfig()
+        >>> config.training(n_step=tune.grid_search([3, 5]))
+        >>> config.environment(env="CartPole-v1")
+        >>> tune.run(
+        >>>     "MADDPG",
+        >>>     stop={"episode_reward_mean": 200},
+        >>>     config=config.to_dict()
+        >>> )
+    """
+
+    def __init__(self, trainer_class=None):
+        """Initializes a MADDPGConfig instance."""
+        super().__init__(trainer_class=trainer_class or MADDPGTrainer)
+
+        # fmt: off
+        # __sphinx_doc_begin__
+        # MADDPG specific config settings:
+        self.agent_id = None
+        self.use_local_critic = False
+        self.use_state_preprocessor = False
+        self.actor_hiddens = [64, 64]
+        self.actor_hidden_activation = "relu"
+        self.critic_hiddens = [64, 64]
+        self.critic_hidden_activation = "relu"
+        self.n_step = 1
+        self.good_policy = "maddpg"
+        self.adv_policy = "maddpg"
+        self.replay_buffer_config = {
+            "type": "MultiAgentReplayBuffer",
+            # Specify prioritized replay by supplying a buffer type that supports
+            # prioritization, for example: MultiAgentPrioritizedReplayBuffer.
+            "prioritized_replay": DEPRECATED_VALUE,
+            "capacity": int(1e6),
+            # How many steps of the model to sample before learning starts.
+            "learning_starts": 1024 * 25,
+            # Force lockstep replay mode for MADDPG.
+            "replay_mode": "lockstep",
+        }
+        self.training_intensity = None
+        self.critic_lr = 1e-2
+        self.actor_lr = 1e-2
+        self.target_network_update_freq = 0
+        self.tau = 0.01
+        self.actor_feature_reg = 0.001
+        self.grad_norm_clipping = 0.5
+
+        # Changes to Trainer's default:
+        self.rollout_fragment_length = 100
+        self.train_batch_size = 1024
+        self.num_workers = 1
+        self.min_time_s_per_reporting = 0
+        # fmt: on
+        # __sphinx_doc_end__
+
+    @override(TrainerConfig)
+    def training(
+        self,
+        *,
+        agent_id: Optional[str] = None,
+        use_local_critic: Optional[bool] = None,
+        use_state_preprocessor: Optional[bool] = None,
+        actor_hiddens: Optional[List[int]] = None,
+        actor_hidden_activation: Optional[str] = None,
+        critic_hiddens: Optional[List[int]] = None,
+        critic_hidden_activation: Optional[str] = None,
+        n_step: Optional[int] = None,
+        good_policy: Optional[str] = None,
+        adv_policy: Optional[str] = None,
+        replay_buffer_config: Optional[dict] = None,
+        training_intensity: Optional[float] = None,
+        critic_lr: Optional[float] = None,
+        actor_lr: Optional[float] = None,
+        target_network_update_freq: Optional[int] = None,
+        tau: Optional[float] = None,
+        actor_feature_reg: Optional[float] = None,
+        grad_norm_clipping: Optional[float] = None,
+        **kwargs,
+    ) -> "MADDPGConfig":
+        """Sets the training related configuration.
+
+        Args:
+            agent_id: ID of the agent controlled by this policy.
+            use_local_critic: Use a local critic for this policy.
+            use_state_preprocessor: Apply a state preprocessor with spec given by the
+                "model" config option (like other RL algorithms). This is mostly useful
+                if you have a weird observation shape, like an image. Disabled by
+                default.
+            actor_hiddens: Postprocess the policy network model output with these hidden
+                layers. If `use_state_preprocessor` is False, then these will be the
+                *only* hidden layers in the network.
+            actor_hidden_activation: Hidden layers activation of the postprocessing
+                stage of the policy network.
+            critic_hiddens: Postprocess the critic network model output with these
+                hidden layers; again, if use_state_preprocessor is True, then the state
+                will be preprocessed by the model specified with the "model" config
+                option first.
+            critic_hidden_activation: Hidden layers activation of the postprocessing
+                stage of the critic.
+            n_step: N-step for Q-learning.
+            good_policy: Algorithm for good policies.
+            adv_policy: Algorithm for adversary policies.
+            replay_buffer_config: Replay buffer config.
+                Examples:
+                {
+                "_enable_replay_buffer_api": True,
+                "type": "MultiAgentReplayBuffer",
+                "learning_starts": 1000,
+                "capacity": 50000,
+                "replay_sequence_length": 1,
+                }
+                - OR -
+                {
+                "_enable_replay_buffer_api": True,
+                "type": "MultiAgentPrioritizedReplayBuffer",
+                "capacity": 50000,
+                "prioritized_replay_alpha": 0.6,
+                "prioritized_replay_beta": 0.4,
+                "prioritized_replay_eps": 1e-6,
+                "replay_sequence_length": 1,
+                }
+                - Where -
+                prioritized_replay_alpha: Alpha parameter controls the degree of
+                prioritization in the buffer. In other words, when a buffer sample
+                has a higher temporal-difference error, how much more likely it is
+                to be drawn and used to update the parametrized Q-network. 0.0
+                corresponds to uniform probability. Setting this much above 1.0 may
+                quickly make the sampling distribution heavily "pointy" with low
+                entropy.
+                prioritized_replay_beta: Beta parameter controls the degree of
+                importance sampling which suppresses the influence of gradient updates
+                from samples that have higher probability of being sampled via alpha
+                parameter and the temporal-difference error.
+                prioritized_replay_eps: Epsilon parameter sets the baseline probability
+                for sampling so that when the temporal-difference error of a sample is
+                zero, there is still a chance of drawing the sample.
+            training_intensity: If set, this will fix the ratio of replayed from a
+                buffer and learned on timesteps to sampled from an environment and
+                stored in the replay buffer timesteps. Otherwise, the replay will
+                proceed at the native ratio determined by
+                `(train_batch_size / rollout_fragment_length)`.
+            critic_lr: Learning rate for the critic (Q-function) optimizer.
+            actor_lr: Learning rate for the actor (policy) optimizer.
+            target_network_update_freq: Update the target network every
+                `target_network_update_freq` sample steps.
+            tau: Update the target by tau * policy + (1 - tau) * target_policy.
+            actor_feature_reg: Weights for feature regularization for the actor.
+            grad_norm_clipping: If not None, clip gradients during optimization at this
+                value.
+
+        Returns:
+            This updated TrainerConfig object.
+        """
+
+        # Pass kwargs onto super's `training()` method.
+        super().training(**kwargs)
+
+        if agent_id is not None:
+            self.agent_id = agent_id
+        if use_local_critic is not None:
+            self.use_local_critic = use_local_critic
+        if use_state_preprocessor is not None:
+            self.use_state_preprocessor = use_state_preprocessor
+        if actor_hiddens is not None:
+            self.actor_hiddens = actor_hiddens
+        if actor_hidden_activation is not None:
+            self.actor_hidden_activation = actor_hidden_activation
+        if critic_hiddens is not None:
+            self.critic_hiddens = critic_hiddens
+        if critic_hidden_activation is not None:
+            self.critic_hidden_activation = critic_hidden_activation
+        if n_step is not None:
+            self.n_step = n_step
+        if good_policy is not None:
+            self.good_policy = good_policy
+        if adv_policy is not None:
+            self.adv_policy = adv_policy
+        if replay_buffer_config is not None:
+            self.replay_buffer_config = replay_buffer_config
+        if training_intensity is not None:
+            self.training_intensity = training_intensity
+        if critic_lr is not None:
+            self.critic_lr = critic_lr
+        if actor_lr is not None:
+            self.actor_lr = actor_lr
+        if target_network_update_freq is not None:
+            self.target_network_update_freq = target_network_update_freq
+        if tau is not None:
+            self.tau = tau
+        if actor_feature_reg is not None:
+            self.actor_feature_reg = actor_feature_reg
+        if grad_norm_clipping is not None:
+            self.grad_norm_clipping = grad_norm_clipping
+
+        return self
 
 
 def before_learn_on_batch(multi_agent_batch, policies, train_batch_size):
@@ -152,7 +278,7 @@ class MADDPGTrainer(DQNTrainer):
     @classmethod
     @override(DQNTrainer)
     def get_default_config(cls) -> TrainerConfigDict:
-        return DEFAULT_CONFIG
+        return MADDPGConfig().to_dict()
 
     @override(DQNTrainer)
     def validate_config(self, config: TrainerConfigDict) -> None:
@@ -175,3 +301,20 @@ def f(batch, workers, config):
     @override(DQNTrainer)
     def get_default_policy_class(self, config: TrainerConfigDict) -> Type[Policy]:
         return MADDPGTFPolicy
+
+
+# Deprecated: Use ray.rllib.algorithms.maddpg.MADDPGConfig instead!
+class _deprecated_default_config(dict):
+    def __init__(self):
+        super().__init__(MADDPGConfig().to_dict())
+
+    @Deprecated(
+        old="ray.rllib.algorithms.maddpg.maddpg.DEFAULT_CONFIG",
+        new="ray.rllib.algorithms.maddpg.maddpg.MADDPGConfig(...)",
+        error=False,
+    )
+    def __getitem__(self, item):
+        return super().__getitem__(item)
+
+
+DEFAULT_CONFIG = _deprecated_default_config()
diff --git a/rllib/algorithms/maddpg/tests/test_maddpg.py b/rllib/algorithms/maddpg/tests/test_maddpg.py
index c6181f7822be..f6e24645a546 100644
--- a/rllib/algorithms/maddpg/tests/test_maddpg.py
+++ b/rllib/algorithms/maddpg/tests/test_maddpg.py
@@ -21,28 +21,32 @@ def tearDownClass(cls) -> None:
 
     def test_maddpg_compilation(self):
         """Test whether an MADDPGTrainer can be built with all frameworks."""
-        config = maddpg.DEFAULT_CONFIG.copy()
-        config["env"] = TwoStepGame
-        config["env_config"] = {
-            "actions_are_logits": True,
-        }
-        config["multiagent"] = {
-            "policies": {
-                "pol1": PolicySpec(
-                    config={"agent_id": 0},
-                ),
-                "pol2": PolicySpec(
-                    config={"agent_id": 1},
-                ),
-            },
-            "policy_mapping_fn": (lambda aid, **kwargs: "pol2" if aid else "pol1"),
-        }
+        config = (
+            maddpg.MADDPGConfig()
+            .environment(
+                env=TwoStepGame,
+                env_config={
+                    "actions_are_logits": True,
+                },
+            )
+            .multi_agent(
+                policies={
+                    "pol1": PolicySpec(
+                        config={"agent_id": 0},
+                    ),
+                    "pol2": PolicySpec(
+                        config={"agent_id": 1},
+                    ),
+                },
+                policy_mapping_fn=lambda aid, **kwargs: "pol2" if aid else "pol1",
+            )
+        )
         num_iterations = 1
 
         # Only working for tf right now.
         for _ in framework_iterator(config, frameworks="tf"):
-            trainer = maddpg.MADDPGTrainer(config)
+            trainer = config.build()
             for i in range(num_iterations):
                 results = trainer.train()
                 check_train_results(results)
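
For reference, below is a minimal standalone sketch of how the MADDPGConfig API introduced in this diff could be used, mirroring test_maddpg_compilation above. It is not part of the patch; it assumes the RLlib version this PR targets, the TwoStepGame example env, and the tf framework (the only one the test exercises).

# Sketch: exercising the new MADDPGConfig fluent API (not part of the patch).
import ray
import ray.rllib.algorithms.maddpg as maddpg
from ray.rllib.examples.env.two_step_game import TwoStepGame
from ray.rllib.policy.policy import PolicySpec

ray.init()

# Build the config the same way the updated test does: environment, framework,
# and the two-policy multi-agent setup, with one agent per policy.
config = (
    maddpg.MADDPGConfig()
    .environment(
        env=TwoStepGame,
        env_config={"actions_are_logits": True},
    )
    .framework("tf")
    .multi_agent(
        policies={
            "pol1": PolicySpec(config={"agent_id": 0}),
            "pol2": PolicySpec(config={"agent_id": 1}),
        },
        policy_mapping_fn=lambda aid, **kwargs: "pol2" if aid else "pol1",
    )
)

# The config object replaces the old `maddpg.DEFAULT_CONFIG.copy()` dict and
# builds the trainer directly.
trainer = config.build()
print(trainer.train())
trainer.stop()
ray.shutdown()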