[RLlib] APPO config objects #24376

Merged · 8 commits · May 2, 2022
4 changes: 2 additions & 2 deletions rllib/agents/impala/impala.py
@@ -62,9 +62,9 @@ class ImpalaConfig(TrainerConfig):
... )
"""

def __init__(self):
def __init__(self, trainer_class=None):
"""Initializes a ImpalaConfig instance."""
super().__init__(trainer_class=ImpalaTrainer)
super().__init__(trainer_class=trainer_class or ImpalaTrainer)

# fmt: off
# __sphinx_doc_begin__
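The `trainer_class` parameter threaded through `ImpalaConfig.__init__` above is what lets subclass configs (such as `APPOConfig` later in this PR) reuse the IMPALA defaults while registering their own trainer class. A minimal standalone sketch of that pattern, using illustrative class names rather than real RLlib classes:

class BaseTrainer:
    pass


class DerivedTrainer(BaseTrainer):
    pass


class BaseConfig:
    def __init__(self, trainer_class=None):
        # Fall back to the base trainer unless a subclass forwards its own.
        self.trainer_class = trainer_class or BaseTrainer


class DerivedConfig(BaseConfig):
    def __init__(self, trainer_class=None):
        # `trainer_class or DerivedTrainer` keeps the override chain intact:
        # a further subclass could still pass its own trainer class down.
        super().__init__(trainer_class=trainer_class or DerivedTrainer)


print(BaseConfig().trainer_class.__name__)     # -> BaseTrainer
print(DerivedConfig().trainer_class.__name__)  # -> DerivedTrainer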
3 changes: 2 additions & 1 deletion rllib/agents/ppo/__init__.py
@@ -1,10 +1,11 @@
from ray.rllib.agents.ppo.ppo import PPOConfig, PPOTrainer, DEFAULT_CONFIG
from ray.rllib.agents.ppo.ppo_tf_policy import PPOTFPolicy
from ray.rllib.agents.ppo.ppo_torch_policy import PPOTorchPolicy
from ray.rllib.agents.ppo.appo import APPOTrainer
from ray.rllib.agents.ppo.appo import APPOConfig, APPOTrainer
from ray.rllib.agents.ppo.ddppo import DDPPOTrainer

__all__ = [
"APPOConfig",
"APPOTrainer",
"DDPPOTrainer",
"DEFAULT_CONFIG",
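With `APPOConfig` added to `__all__`, the config class can be imported from the same place as the trainer. A quick sanity-check sketch (assumes a Ray install with RLlib; not part of the PR itself):

import ray.rllib.agents.ppo as ppo

# The config class is now exported next to the trainer.
assert "APPOConfig" in ppo.__all__
print(type(ppo.APPOConfig()).__name__)  # -> APPOConfig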
212 changes: 151 additions & 61 deletions rllib/agents/ppo/appo.py
@@ -23,69 +23,142 @@
_get_shared_metrics,
)
from ray.rllib.utils.annotations import override
from ray.rllib.utils.deprecation import Deprecated
from ray.rllib.utils.typing import PartialTrainerConfigDict, TrainerConfigDict

# fmt: off
# __sphinx_doc_begin__

# Adds the following updates to the `IMPALATrainer` config in
# rllib/agents/impala/impala.py.
DEFAULT_CONFIG = impala.ImpalaTrainer.merge_trainer_configs(
impala.DEFAULT_CONFIG, # See keys in impala.py, which are also supported.
{
# Whether to use V-trace weighted advantages. If false, PPO GAE
# advantages will be used instead.
"vtrace": True,

# == These two options only apply if vtrace: False ==
# Should use a critic as a baseline (otherwise don't use value
# baseline; required for using GAE).
"use_critic": True,
# If true, use the Generalized Advantage Estimator (GAE)
# with a value function, see https://arxiv.org/pdf/1506.02438.pdf.
"use_gae": True,
# GAE(lambda) parameter
"lambda": 1.0,

# == PPO surrogate loss options ==
"clip_param": 0.4,

# == PPO KL Loss options ==
"use_kl_loss": False,
"kl_coeff": 1.0,
"kl_target": 0.01,

# == IMPALA optimizer params (see documentation in impala.py) ==
"rollout_fragment_length": 50,
"train_batch_size": 500,
"min_time_s_per_reporting": 10,
"num_workers": 2,
"num_gpus": 0,
"num_multi_gpu_tower_stacks": 1,
"minibatch_buffer_size": 1,
"num_sgd_iter": 1,
"replay_proportion": 0.0,
"replay_buffer_num_slots": 100,
"learner_queue_size": 16,
"learner_queue_timeout": 300,
"max_sample_requests_in_flight_per_worker": 2,
"broadcast_interval": 1,
"grad_clip": 40.0,
"opt_type": "adam",
"lr": 0.0005,
"lr_schedule": None,
"decay": 0.99,
"momentum": 0.0,
"epsilon": 0.1,
"vf_loss_coeff": 0.5,
"entropy_coeff": 0.01,
"entropy_coeff_schedule": None,
},
_allow_unknown_configs=True,
)

# __sphinx_doc_end__
# fmt: on
class APPOConfig(impala.ImpalaConfig):
"""Defines a APPOTrainer configuration class from which a new Trainer can be built.

Example:
>>> from ray.rllib.agents.ppo import APPOConfig
>>> config = APPOConfig().training(lr=0.01, grad_clip=30.0)\
... .resources(num_gpus=1)\
... .rollouts(num_rollout_workers=16)
>>> print(config.to_dict())
>>> # Build a Trainer object from the config and run 1 training iteration.
>>> trainer = config.build(env="CartPole-v1")
>>> trainer.train()

Example:
>>> from ray.rllib.agents.ppo import APPOConfig
>>> from ray import tune
>>> config = APPOConfig()
>>> # Print out some default values.
>>> print(config.sample_async)
>>> # Update the config object.
>>> config.training(lr=tune.grid_search([0.001, 0.0001]))
>>> # Set the config object's env.
>>> config.environment(env="CartPole-v1")
>>> # Use to_dict() to get the old-style python config dict
>>> # when running with tune.
>>> tune.run(
... "APPO",
... stop={"episode_reward_mean": 200},
... config=config.to_dict(),
... )
"""

def __init__(self, trainer_class=None):
"""Initializes a APPOConfig instance."""
super().__init__(trainer_class=trainer_class or APPOTrainer)

# fmt: off
# __sphinx_doc_begin__

# APPO specific settings:
self.vtrace = True
self.use_critic = True
self.use_gae = True
self.lambda_ = 1.0
self.clip_param = 0.4
self.use_kl_loss = False
self.kl_coeff = 1.0
self.kl_target = 0.01

# Override some of ImpalaConfig's default values with APPO-specific values.
self.rollout_fragment_length = 50
self.train_batch_size = 500
self.min_time_s_per_reporting = 10
self.num_workers = 2
self.num_gpus = 0
self.num_multi_gpu_tower_stacks = 1
self.minibatch_buffer_size = 1
self.num_sgd_iter = 1
self.replay_proportion = 0.0
self.replay_buffer_num_slots = 100
self.learner_queue_size = 16
self.learner_queue_timeout = 300
self.max_sample_requests_in_flight_per_worker = 2
self.broadcast_interval = 1
self.grad_clip = 40.0
self.opt_type = "adam"
self.lr = 0.0005
self.lr_schedule = None
self.decay = 0.99
self.momentum = 0.0
self.epsilon = 0.1
self.vf_loss_coeff = 0.5
self.entropy_coeff = 0.01
self.entropy_coeff_schedule = None
# __sphinx_doc_end__
# fmt: on

@override(impala.ImpalaConfig)
def training(
self,
*,
vtrace: Optional[bool] = None,
use_critic: Optional[bool] = None,
use_gae: Optional[bool] = None,
lambda_: Optional[float] = None,
clip_param: Optional[float] = None,
use_kl_loss: Optional[bool] = None,
kl_coeff: Optional[float] = None,
kl_target: Optional[float] = None,
**kwargs,
) -> "APPOConfig":
"""Sets the training related configuration.

Args:
vtrace: Whether to use V-trace weighted advantages. If false, PPO GAE
advantages will be used instead.
use_critic: Should use a critic as a baseline (otherwise don't use value
baseline; required for using GAE). Only applies if vtrace=False.
use_gae: If true, use the Generalized Advantage Estimator (GAE)
with a value function, see https://arxiv.org/pdf/1506.02438.pdf.
Only applies if vtrace=False.
lambda_: GAE (lambda) parameter.
clip_param: PPO surrogate clipping parameter.
use_kl_loss: Whether to use the KL-term in the loss function.
kl_coeff: Coefficient for weighting the KL-loss term.
kl_target: Target term for the KL-term to reach (via adjusting the
`kl_coeff` automatically).

Returns:
This updated TrainerConfig object.
"""
# Pass kwargs onto super's `training()` method.
super().training(**kwargs)

if vtrace is not None:
self.vtrace = vtrace
if use_critic is not None:
self.use_critic = use_critic
if use_gae is not None:
self.use_gae = use_gae
if lambda_ is not None:
self.lambda_ = lambda_
if clip_param is not None:
self.clip_param = clip_param
if use_kl_loss is not None:
self.use_kl_loss = use_kl_loss
if kl_coeff is not None:
self.kl_coeff = kl_coeff
if kl_target is not None:
self.kl_target = kl_target

return self


class UpdateTargetAndKL:
@@ -130,7 +203,7 @@ def __init__(self, config, *args, **kwargs):
@classmethod
@override(Trainer)
def get_default_config(cls) -> TrainerConfigDict:
return DEFAULT_CONFIG
return APPOConfig().to_dict()

@override(Trainer)
def get_default_policy_class(
@@ -142,3 +215,20 @@ def get_default_policy_class(
return AsyncPPOTorchPolicy
else:
return AsyncPPOTFPolicy


# Deprecated: Use ray.rllib.agents.ppo.APPOConfig instead!
class _deprecated_default_config(dict):
def __init__(self):
super().__init__(APPOConfig().to_dict())

@Deprecated(
old="ray.rllib.agents.ppo.appo.DEFAULT_CONFIG",
new="ray.rllib.agents.ppo.appo.APPOConfig(...)",
error=False,
)
def __getitem__(self, item):
return super().__getitem__(item)


DEFAULT_CONFIG = _deprecated_default_config()
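The `_deprecated_default_config` shim above keeps old-style `DEFAULT_CONFIG[...]` lookups working while steering users toward `APPOConfig`. The mechanism is simply a dict subclass whose item access emits a warning; here is a standalone sketch of the same idea using the standard `warnings` module instead of RLlib's `@Deprecated` decorator (the config values shown are illustrative, whereas the real shim wraps the full `APPOConfig().to_dict()`):

import warnings


class DeprecatedDefaultConfig(dict):
    """Dict that warns every time a key is read."""

    def __getitem__(self, item):
        warnings.warn(
            "DEFAULT_CONFIG is deprecated; build an APPOConfig() instead.",
            DeprecationWarning,
            stacklevel=2,
        )
        return super().__getitem__(item)


# Old-style access still works, but now carries a DeprecationWarning.
DEFAULT_CONFIG = DeprecatedDefaultConfig({"lr": 0.0005, "clip_param": 0.4})
print(DEFAULT_CONFIG["lr"])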
77 changes: 41 additions & 36 deletions rllib/agents/ppo/tests/test_appo.py
@@ -22,15 +22,13 @@ def tearDownClass(cls):

def test_appo_compilation(self):
"""Test whether an APPOTrainer can be built with both frameworks."""
config = ppo.appo.DEFAULT_CONFIG.copy()
config["num_workers"] = 1
config = ppo.appo.APPOConfig().rollouts(num_rollout_workers=1)
num_iterations = 2

for _ in framework_iterator(config, with_eager_tracing=True):
print("w/o v-trace")
_config = config.copy()
_config["vtrace"] = False
trainer = ppo.APPOTrainer(config=_config, env="CartPole-v0")
config.vtrace = False
trainer = config.build(env="CartPole-v0")
for i in range(num_iterations):
results = trainer.train()
check_train_results(results)
@@ -39,9 +37,8 @@ def test_appo_compilation(self):
trainer.stop()

print("w/ v-trace")
_config = config.copy()
_config["vtrace"] = True
trainer = ppo.APPOTrainer(config=_config, env="CartPole-v0")
config.vtrace = True
trainer = config.build(env="CartPole-v0")
for i in range(num_iterations):
results = trainer.train()
check_train_results(results)
@@ -51,13 +48,15 @@ def test_appo_compilation_use_kl_loss(self):

def test_appo_compilation_use_kl_loss(self):
"""Test whether an APPOTrainer can be built with kl_loss enabled."""
config = ppo.appo.DEFAULT_CONFIG.copy()
config["num_workers"] = 1
config["use_kl_loss"] = True
config = (
ppo.appo.APPOConfig()
.rollouts(num_rollout_workers=1)
.training(use_kl_loss=True)
)
num_iterations = 2

for _ in framework_iterator(config, with_eager_tracing=True):
trainer = ppo.APPOTrainer(config=config, env="CartPole-v0")
trainer = config.build(env="CartPole-v0")
for i in range(num_iterations):
results = trainer.train()
check_train_results(results)
@@ -66,22 +65,22 @@ def test_appo_compilation_use_kl_loss(self):
trainer.stop()

def test_appo_two_tf_optimizers(self):
config = ppo.appo.DEFAULT_CONFIG.copy()
config["num_workers"] = 1

# Not explicitly setting this should cause a warning, but not fail.
# config["_tf_policy_handles_more_than_one_loss"] = True
config["_separate_vf_optimizer"] = True
config["_lr_vf"] = 0.0002

config = (
ppo.appo.APPOConfig()
.rollouts(num_rollout_workers=1)
.training(_separate_vf_optimizer=True, _lr_vf=0.002)
)
# Make sure we have two completely separate models for policy and
# value function.
config["model"]["vf_share_layers"] = False
config.model["vf_share_layers"] = False

num_iterations = 2

# Only supported for tf so far.
for _ in framework_iterator(config, frameworks=("tf2", "tf")):
trainer = ppo.APPOTrainer(config=config, env="CartPole-v0")
trainer = config.build(env="CartPole-v0")
for i in range(num_iterations):
results = trainer.train()
check_train_results(results)
@@ -90,23 +89,29 @@ def test_appo_two_tf_optimizers(self):
trainer.stop()

def test_appo_entropy_coeff_schedule(self):
config = ppo.appo.DEFAULT_CONFIG.copy()
config["num_workers"] = 1
config["num_gpus"] = 0
config["train_batch_size"] = 20
config["batch_mode"] = "truncate_episodes"
config["rollout_fragment_length"] = 10
config["min_sample_timesteps_per_reporting"] = 20
# Initial lr, doesn't really matter because of the schedule below.
config = (
ppo.appo.APPOConfig()
.rollouts(
num_rollout_workers=1,
batch_mode="truncate_episodes",
rollout_fragment_length=10,
)
.resources(num_gpus=0)
.training(
train_batch_size=20,
entropy_coeff=0.01,
entropy_coeff_schedule=[
[0, 0.01],
[120, 0.0001],
],
)
)

config.min_sample_timesteps_per_reporting = 20
# 0 metrics reporting delay, this makes sure timestep,
# which entropy coeff depends on, is updated after each worker rollout.
config["min_time_s_per_reporting"] = 0
# Initial lr, doesn't really matter because of the schedule below.
config["entropy_coeff"] = 0.01
schedule = [
[0, 0.01],
[120, 0.0001],
]
config["entropy_coeff_schedule"] = schedule
config.min_time_s_per_reporting = 0

def _step_n_times(trainer, n: int):
"""Step trainer n times.
@@ -121,7 +126,7 @@ def _step_n_times(trainer, n: int):
]

for _ in framework_iterator(config):
trainer = ppo.APPOTrainer(config=config, env="CartPole-v0")
trainer = config.build(env="CartPole-v0")

coeff = _step_n_times(trainer, 1) # 20 timesteps
# Should be close to the starting coeff of 0.01.
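The schedule used in this test anneals the entropy coefficient from 0.01 at timestep 0 down to 0.0001 at timestep 120. Assuming RLlib's usual piecewise-linear interpolation between schedule points, the values the test's assertions rely on can be reproduced with a short sketch (the helper below is illustrative, not an RLlib API):

def entropy_coeff_at(t, schedule=((0, 0.01), (120, 0.0001))):
    """Linearly interpolate between two schedule points, clamping outside."""
    (t0, v0), (t1, v1) = schedule
    if t <= t0:
        return v0
    if t >= t1:
        return v1
    return v0 + (t - t0) / (t1 - t0) * (v1 - v0)


print(entropy_coeff_at(20))   # ~0.00835 -> still close to the initial 0.01
print(entropy_coeff_at(120))  # 0.0001   -> fully annealed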