[RLlib] APPO config objects #24376

Merged (8 commits) on May 2, 2022
Changes from 2 commits
2 changes: 1 addition & 1 deletion rllib/__init__.py
@@ -57,7 +57,7 @@ def setup(self, config):

_setup_logger()

usage_lib.record_library_usage("rllib")
# usage_lib.record_library_usage("rllib")
Contributor
Could you please explain this? Can we remove it altogether?

Contributor Author
Oh, sorry, this shouldn't be here. Will remove ...


__all__ = [
"Policy",
5 changes: 3 additions & 2 deletions rllib/agents/impala/__init__.py
@@ -1,6 +1,7 @@
from ray.rllib.agents.impala.impala import DEFAULT_CONFIG, ImpalaTrainer
from ray.rllib.agents.impala.impala import DEFAULT_CONFIG, ImpalaConfig, ImpalaTrainer

__all__ = [
"DEFAULT_CONFIG",
"ImpalaConfig",
"ImpalaTrainer",
"DEFAULT_CONFIG",
]
384 changes: 277 additions & 107 deletions rllib/agents/impala/impala.py

Large diffs are not rendered by default.

47 changes: 28 additions & 19 deletions rllib/agents/impala/tests/test_impala.py
@@ -26,26 +26,32 @@ def tearDownClass(cls) -> None:

def test_impala_compilation(self):
"""Test whether an ImpalaTrainer can be built with both frameworks."""
config = impala.DEFAULT_CONFIG.copy()
config["num_gpus"] = 0
config["model"]["lstm_use_prev_action"] = True
config["model"]["lstm_use_prev_reward"] = True
config = (
impala.ImpalaConfig()
.resources(num_gpus=0)
.training(
model={
"lstm_use_prev_action": True,
"lstm_use_prev_reward": True,
}
)
)

num_iterations = 1
env = "CartPole-v0"

for _ in framework_iterator(config, with_eager_tracing=True):
local_cfg = config.copy()
for lstm in [False, True]:
local_cfg["num_aggregation_workers"] = 0 if not lstm else 1
local_cfg["model"]["use_lstm"] = lstm
config.num_aggregation_workers = 0 if not lstm else 1
config.model["use_lstm"] = lstm
print(
"lstm={} aggregation-workers={}".format(
lstm, local_cfg["num_aggregation_workers"]
lstm, config.num_aggregation_workers
)
)
# Test with and w/o aggregation workers (this has nothing
# to do with LSTMs, though).
trainer = impala.ImpalaTrainer(config=local_cfg, env=env)
trainer = config.build(env=env)
for i in range(num_iterations):
results = trainer.train()
check_train_results(results)
@@ -59,25 +65,28 @@ def test_impala_compilation(self):
trainer.stop()

def test_impala_lr_schedule(self):
config = impala.DEFAULT_CONFIG.copy()
config["num_gpus"] = 0
# Test whether we correctly ignore the "lr" setting.
# The first lr should be 0.05.
config["lr"] = 0.1
config["lr_schedule"] = [
[0, 0.05],
[10000, 0.000001],
]
config["num_gpus"] = 0 # Do not use any (fake) GPUs.
config["env"] = "CartPole-v0"
config = (
impala.ImpalaConfig()
.resources(num_gpus=0)
.training(
lr=0.1,
lr_schedule=[
[0, 0.05],
[10000, 0.000001],
],
)
)
config.environment(env="CartPole-v0")

def get_lr(result):
return result["info"][LEARNER_INFO][DEFAULT_POLICY_ID][LEARNER_STATS_KEY][
"cur_lr"
]

for fw in framework_iterator(config):
trainer = impala.ImpalaTrainer(config=config)
trainer = config.build()
policy = trainer.get_policy()

try:
3 changes: 2 additions & 1 deletion rllib/agents/ppo/__init__.py
@@ -1,10 +1,11 @@
from ray.rllib.agents.ppo.ppo import PPOConfig, PPOTrainer, DEFAULT_CONFIG
from ray.rllib.agents.ppo.ppo_tf_policy import PPOTFPolicy
from ray.rllib.agents.ppo.ppo_torch_policy import PPOTorchPolicy
from ray.rllib.agents.ppo.appo import APPOTrainer
from ray.rllib.agents.ppo.appo import APPOConfig, APPOTrainer
from ray.rllib.agents.ppo.ddppo import DDPPOTrainer

__all__ = [
"APPOConfig",
"APPOTrainer",
"DDPPOTrainer",
"DEFAULT_CONFIG",
210 changes: 149 additions & 61 deletions rllib/agents/ppo/appo.py
@@ -23,69 +23,140 @@
_get_shared_metrics,
)
from ray.rllib.utils.annotations import override
from ray.rllib.utils.deprecation import Deprecated
from ray.rllib.utils.typing import PartialTrainerConfigDict, TrainerConfigDict

# fmt: off
# __sphinx_doc_begin__

# Adds the following updates to the `IMPALATrainer` config in
# rllib/agents/impala/impala.py.
DEFAULT_CONFIG = impala.ImpalaTrainer.merge_trainer_configs(
impala.DEFAULT_CONFIG, # See keys in impala.py, which are also supported.
{
# Whether to use V-trace weighted advantages. If false, PPO GAE
# advantages will be used instead.
"vtrace": True,

# == These two options only apply if vtrace: False ==
# Should use a critic as a baseline (otherwise don't use value
# baseline; required for using GAE).
"use_critic": True,
# If true, use the Generalized Advantage Estimator (GAE)
# with a value function, see https://arxiv.org/pdf/1506.02438.pdf.
"use_gae": True,
# GAE(lambda) parameter
"lambda": 1.0,

# == PPO surrogate loss options ==
"clip_param": 0.4,

# == PPO KL Loss options ==
"use_kl_loss": False,
"kl_coeff": 1.0,
"kl_target": 0.01,

# == IMPALA optimizer params (see documentation in impala.py) ==
"rollout_fragment_length": 50,
"train_batch_size": 500,
"min_time_s_per_reporting": 10,
"num_workers": 2,
"num_gpus": 0,
"num_multi_gpu_tower_stacks": 1,
"minibatch_buffer_size": 1,
"num_sgd_iter": 1,
"replay_proportion": 0.0,
"replay_buffer_num_slots": 100,
"learner_queue_size": 16,
"learner_queue_timeout": 300,
"max_sample_requests_in_flight_per_worker": 2,
"broadcast_interval": 1,
"grad_clip": 40.0,
"opt_type": "adam",
"lr": 0.0005,
"lr_schedule": None,
"decay": 0.99,
"momentum": 0.0,
"epsilon": 0.1,
"vf_loss_coeff": 0.5,
"entropy_coeff": 0.01,
"entropy_coeff_schedule": None,
},
_allow_unknown_configs=True,
)

# __sphinx_doc_end__
# fmt: on
class APPOConfig(impala.ImpalaConfig):
"""Defines a A2CTrainer configuration class from which a new Trainer can be built.

Example:
>>> from ray import tune
Contributor
APPOConfig import

>>> config = APPOConfig().training(lr=0.01, grad_clip=30.0)\
... .resources(num_gpus=1)\
... .rollouts(num_rollout_workers=16)
>>> print(config.to_dict())
>>> # Build a Trainer object from the config and run 1 training iteration.
>>> trainer = config.build(env="CartPole-v1")
>>> trainer.train()

Example:
>>> config = APPOConfig()
>>> # Print out some default values.
>>> print(config.sample_async)
>>> # Update the config object.
>>> config.training(lr=tune.grid_search([0.001, 0.0001]))
>>> # Set the config object's env.
>>> config.environment(env="CartPole-v1")
>>> # Use to_dict() to get the old-style python config dict
>>> # when running with tune.
>>> tune.run(
... "APPO",
... stop={"episode_reward_mean": 200},
... config=config.to_dict(),
... )
"""

def __init__(self, trainer_class=None):
"""Initializes a APPOConfig instance."""
super().__init__(trainer_class=trainer_class or APPOTrainer)

# fmt: off
# __sphinx_doc_begin__

# APPO specific settings:
self.vtrace = True
self.use_critic = True
self.use_gae = True
self.lambda_ = 1.0
self.clip_param = 0.4
self.use_kl_loss = False
self.kl_coeff = 1.0
self.kl_target = 0.01

# Override some of ImpalaConfig's default values with APPO-specific values.
self.rollout_fragment_length = 50
self.train_batch_size = 500
self.min_time_s_per_reporting = 10
self.num_workers = 2
self.num_gpus = 0
self.num_multi_gpu_tower_stacks = 1
self.minibatch_buffer_size = 1
self.num_sgd_iter = 1
self.replay_proportion = 0.0
self.replay_buffer_num_slots = 100
self.learner_queue_size = 16
self.learner_queue_timeout = 300
self.max_sample_requests_in_flight_per_worker = 2
self.broadcast_interval = 1
self.grad_clip = 40.0
self.opt_type = "adam"
self.lr = 0.0005
self.lr_schedule = None
self.decay = 0.99
self.momentum = 0.0
self.epsilon = 0.1
self.vf_loss_coeff = 0.5
self.entropy_coeff = 0.01
self.entropy_coeff_schedule = None
# __sphinx_doc_end__
# fmt: on

@override(impala.ImpalaConfig)
def training(
self,
*,
vtrace: Optional[bool] = None,
use_critic: Optional[bool] = None,
use_gae: Optional[bool] = None,
lambda_: Optional[float] = None,
clip_param: Optional[float] = None,
use_kl_loss: Optional[bool] = None,
kl_coeff: Optional[float] = None,
kl_target: Optional[float] = None,
**kwargs,
) -> "APPOConfig":
"""Sets the training related configuration.

Args:
vtrace: Whether to use V-trace weighted advantages. If false, PPO GAE
advantages will be used instead.
use_critic: Should use a critic as a baseline (otherwise don't use value
baseline; required for using GAE). Only applies if vtrace=False.
use_gae: If true, use the Generalized Advantage Estimator (GAE)
with a value function, see https://arxiv.org/pdf/1506.02438.pdf.
Only applies if vtrace=False.
lambda_: GAE (lambda) parameter.
            clip_param: PPO surrogate clipping parameter.
use_kl_loss: Whether to use the KL-term in the loss function.
kl_coeff: Coefficient for weighting the KL-loss term.
kl_target: Target term for the KL-term to reach (via adjusting the
`kl_coeff` automatically).

Returns:
This updated TrainerConfig object.
"""
# Pass kwargs onto super's `training()` method.
super().training(**kwargs)

if vtrace is not None:
self.vtrace = vtrace
if use_critic is not None:
self.use_critic = use_critic
if use_gae is not None:
self.use_gae = use_gae
if lambda_ is not None:
self.lambda_ = lambda_
if clip_param is not None:
self.clip_param = clip_param
if use_kl_loss is not None:
self.use_kl_loss = use_kl_loss
if kl_coeff is not None:
self.kl_coeff = kl_coeff
if kl_target is not None:
self.kl_target = kl_target

return self
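
A quick usage sketch of the APPO-specific training() arguments documented above, assuming the APPOConfig import from the earlier sketch and arbitrary values: turning V-trace off switches the loss to PPO-style GAE advantages, and the KL settings only take effect with use_kl_loss=True.

    # Sketch: exercising the APPO-specific training() overrides;
    # values are illustrative, not recommended settings.
    config = APPOConfig().training(
        vtrace=False,      # fall back to PPO GAE advantages
        use_gae=True,      # only applies when vtrace=False
        lambda_=0.95,
        clip_param=0.3,
        use_kl_loss=True,
        kl_coeff=0.5,
        kl_target=0.01,
    )
    trainer = config.build(env="CartPole-v1")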


class UpdateTargetAndKL:
@@ -130,7 +201,7 @@ def __init__(self, config, *args, **kwargs):
@classmethod
@override(Trainer)
def get_default_config(cls) -> TrainerConfigDict:
return DEFAULT_CONFIG
return APPOConfig().to_dict()

@override(Trainer)
def get_default_policy_class(
@@ -142,3 +213,20 @@ def get_default_policy_class(
return AsyncPPOTorchPolicy
else:
return AsyncPPOTFPolicy


# Deprecated: Use ray.rllib.agents.ppo.appo.APPOConfig instead!
class _deprecated_default_config(dict):
def __init__(self):
super().__init__(APPOConfig().to_dict())

@Deprecated(
old="ray.rllib.agents.ppo.appo.DEFAULT_CONFIG",
new="ray.rllib.agents.ppo.appo.APPOConfig(...)",
error=False,
)
def __getitem__(self, item):
return super().__getitem__(item)


DEFAULT_CONFIG = _deprecated_default_config()
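
The dict wrapper above keeps old-style key access working while warning through @Deprecated; a hedged sketch of the migration it encourages:

    # Old style: still works, but key access emits a deprecation warning.
    from ray.rllib.agents.ppo import appo
    old_lr = appo.DEFAULT_CONFIG["lr"]

    # New style: configure and read attributes on an APPOConfig instance.
    from ray.rllib.agents.ppo import APPOConfig
    new_cfg = APPOConfig().training(lr=0.0005)
    assert new_cfg.lr == 0.0005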