From d91ff3a4a70c96c34ee4aa6b9c23a07f20431c86 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Sun, 1 May 2022 21:15:39 +0200 Subject: [PATCH 1/5] wip --- rllib/agents/impala/__init__.py | 5 +- rllib/agents/impala/impala.py | 384 ++++++++++++++++------- rllib/agents/impala/tests/test_impala.py | 47 +-- 3 files changed, 308 insertions(+), 128 deletions(-) diff --git a/rllib/agents/impala/__init__.py b/rllib/agents/impala/__init__.py index ed24770f6f88..07c4f39abc9a 100644 --- a/rllib/agents/impala/__init__.py +++ b/rllib/agents/impala/__init__.py @@ -1,6 +1,7 @@ -from ray.rllib.agents.impala.impala import DEFAULT_CONFIG, ImpalaTrainer +from ray.rllib.agents.impala.impala import DEFAULT_CONFIG, ImpalaConfig, ImpalaTrainer __all__ = [ - "DEFAULT_CONFIG", + "ImpalaConfig", "ImpalaTrainer", + "DEFAULT_CONFIG", ] diff --git a/rllib/agents/impala/impala.py b/rllib/agents/impala/impala.py index c431d1ef4114..a1af09ad3275 100644 --- a/rllib/agents/impala/impala.py +++ b/rllib/agents/impala/impala.py @@ -1,9 +1,9 @@ import logging -from typing import Optional, Type +from typing import Callable, List, Optional, Type, Union import ray from ray.rllib.agents.impala.vtrace_tf_policy import VTraceTFPolicy -from ray.rllib.agents.trainer import Trainer, with_common_config +from ray.rllib.agents.trainer import Trainer, TrainerConfig from ray.rllib.execution.learner_thread import LearnerThread from ray.rllib.execution.multi_gpu_learner_thread import MultiGPULearnerThread from ray.rllib.execution.tree_agg import gather_experiences_tree_aggregation @@ -19,115 +19,255 @@ from ray.rllib.execution.metric_ops import StandardMetricsReporting from ray.rllib.policy.policy import Policy from ray.rllib.utils.annotations import override -from ray.rllib.utils.deprecation import DEPRECATED_VALUE, deprecation_warning +from ray.rllib.utils.deprecation import ( + Deprecated, + DEPRECATED_VALUE, + deprecation_warning, +) from ray.rllib.utils.typing import PartialTrainerConfigDict, TrainerConfigDict from ray.tune.utils.placement_groups import PlacementGroupFactory logger = logging.getLogger(__name__) -# fmt: off -# __sphinx_doc_begin__ -DEFAULT_CONFIG = with_common_config({ - # V-trace params (see vtrace_tf/torch.py). - "vtrace": True, - "vtrace_clip_rho_threshold": 1.0, - "vtrace_clip_pg_rho_threshold": 1.0, - # If True, drop the last timestep for the vtrace calculations, such that - # all data goes into the calculations as [B x T-1] (+ the bootstrap value). - # This is the default and legacy RLlib behavior, however, could potentially - # have a destabilizing effect on learning, especially in sparse reward - # or reward-at-goal environments. - # False for not dropping the last timestep. - "vtrace_drop_last_ts": True, - # System params. - # - # == Overview of data flow in IMPALA == - # 1. Policy evaluation in parallel across `num_workers` actors produces - # batches of size `rollout_fragment_length * num_envs_per_worker`. - # 2. If enabled, the replay buffer stores and produces batches of size - # `rollout_fragment_length * num_envs_per_worker`. - # 3. If enabled, the minibatch ring buffer stores and replays batches of - # size `train_batch_size` up to `num_sgd_iter` times per batch. - # 4. The learner thread executes data parallel SGD across `num_gpus` GPUs - # on batches of size `train_batch_size`. - # - "rollout_fragment_length": 50, - "train_batch_size": 500, - "min_time_s_per_reporting": 10, - "num_workers": 2, - # Number of GPUs the learner should use. 
- "num_gpus": 1, - # For each stack of multi-GPU towers, how many slots should we reserve for - # parallel data loading? Set this to >1 to load data into GPUs in - # parallel. This will increase GPU memory usage proportionally with the - # number of stacks. - # Example: - # 2 GPUs and `num_multi_gpu_tower_stacks=3`: - # - One tower stack consists of 2 GPUs, each with a copy of the - # model/graph. - # - Each of the stacks will create 3 slots for batch data on each of its - # GPUs, increasing memory requirements on each GPU by 3x. - # - This enables us to preload data into these stacks while another stack - # is performing gradient calculations. - "num_multi_gpu_tower_stacks": 1, - # How many train batches should be retained for minibatching. This conf - # only has an effect if `num_sgd_iter > 1`. - "minibatch_buffer_size": 1, - # Number of passes to make over each train batch. - "num_sgd_iter": 1, - # Set >0 to enable experience replay. Saved samples will be replayed with - # a p:1 proportion to new data samples. - "replay_proportion": 0.0, - # Number of sample batches to store for replay. The number of transitions - # saved total will be (replay_buffer_num_slots * rollout_fragment_length). - "replay_buffer_num_slots": 0, - # Max queue size for train batches feeding into the learner. - "learner_queue_size": 16, - # Wait for train batches to be available in minibatch buffer queue - # this many seconds. This may need to be increased e.g. when training - # with a slow environment. - "learner_queue_timeout": 300, - # Level of queuing for sampling. - "max_sample_requests_in_flight_per_worker": 2, - # Max number of workers to broadcast one set of weights to. - "broadcast_interval": 1, - # Use n (`num_aggregation_workers`) extra Actors for multi-level - # aggregation of the data produced by the m RolloutWorkers - # (`num_workers`). Note that n should be much smaller than m. - # This can make sense if ingesting >2GB/s of samples, or if - # the data requires decompression. - "num_aggregation_workers": 0, - - # Learning params. - "grad_clip": 40.0, - # Either "adam" or "rmsprop". - "opt_type": "adam", - "lr": 0.0005, - "lr_schedule": None, - # `opt_type=rmsprop` settings. - "decay": 0.99, - "momentum": 0.0, - "epsilon": 0.1, - # Balancing the three losses. - "vf_loss_coeff": 0.5, - "entropy_coeff": 0.01, - "entropy_coeff_schedule": None, - # Set this to true to have two separate optimizers optimize the policy- - # and value networks. - "_separate_vf_optimizer": False, - # If _separate_vf_optimizer is True, define separate learning rate - # for the value network. - "_lr_vf": 0.0005, - - # Callback for APPO to use to update KL, target network periodically. - # The input to the callback is the learner fetches dict. - "after_train_step": None, - - # DEPRECATED: - "num_data_loader_buffers": DEPRECATED_VALUE, -}) -# __sphinx_doc_end__ -# fmt: on + +class ImpalaConfig(TrainerConfig): + """Defines an ARSTrainer configuration class from which an ImpalaTrainer can be built. + + Example: + >>> config = ImpalaConfig().training(lr=0.0003, train_batch_size=512)\ + ... .resources(num_gpus=4)\ + ... .rollouts(num_rollout_workers=64) + >>> print(config.to_dict()) + >>> # Build a Trainer object from the config and run 1 training iteration. + >>> trainer = config.build(env="CartPole-v1") + >>> trainer.train() + + Example: + >>> from ray import tune + >>> config = ImpalaConfig() + >>> # Print out some default values. + >>> print(config.vtrace) + >>> # Update the config object. 
+ >>> config.training(lr=tune.grid_search([0.0001, 0.0003]), grad_clip=20.0) + >>> # Set the config object's env. + >>> config.environment(env="CartPole-v1") + >>> # Use to_dict() to get the old-style python config dict + >>> # when running with tune. + >>> tune.run( + ... "IMPALA", + ... stop={"episode_reward_mean": 200}, + ... config=config.to_dict(), + ... ) + """ + + def __init__(self): + """Initializes a ImpalaConfig instance.""" + super().__init__(trainer_class=ImpalaTrainer) + + # fmt: off + # __sphinx_doc_begin__ + + # IMPALA specific settings: + self.vtrace = True + self.vtrace_clip_rho_threshold = 1.0 + self.vtrace_clip_pg_rho_threshold = 1.0 + self.vtrace_drop_last_ts = True + self.num_multi_gpu_tower_stacks = 1 + self.minibatch_buffer_size = 1 + self.num_sgd_iter = 1 + self.replay_proportion = 0.0 + self.replay_buffer_num_slots = 0 + self.learner_queue_size = 16 + self.learner_queue_timeout = 300 + self.max_sample_requests_in_flight_per_worker = 2 + self.broadcast_interval = 1 + self.num_aggregation_workers = 0 + self.grad_clip = 40.0 + self.opt_type = "adam" + self.lr_schedule = None + self.decay = 0.99 + self.momentum = 0.0 + self.epsilon = 0.1 + self.vf_loss_coeff = 0.5 + self.entropy_coeff = 0.01 + self.entropy_coeff_schedule = None + self._separate_vf_optimizer = False + self._lr_vf = 0.0005 + self.after_train_step = None + + # Override some of TrainerConfig's default values with ARS-specific values. + self.rollout_fragment_length = 50 + self.train_batch_size = 500 + self.num_workers = 2 + self.num_gpus = 1 + self.lr = 0.0005 + self.min_time_s_per_reporting = 10 + # __sphinx_doc_end__ + # fmt: on + + # Deprecated value. + self.num_data_loader_buffers = DEPRECATED_VALUE + + @override(TrainerConfig) + def training( + self, + *, + vtrace: Optional[bool] = None, + vtrace_clip_rho_threshold: Optional[float] = None, + vtrace_clip_pg_rho_threshold: Optional[float] = None, + vtrace_drop_last_ts: Optional[bool] = None, + num_multi_gpu_tower_stacks: Optional[int] = None, + minibatch_buffer_size: Optional[int] = None, + num_sgd_iter: Optional[int] = None, + replay_proportion: Optional[float] = None, + replay_buffer_num_slots: Optional[int] = None, + learner_queue_size: Optional[int] = None, + learner_queue_timeout: Optional[float] = None, + max_sample_requests_in_flight_per_worker: Optional[int] = None, + broadcast_interval: Optional[int] = None, + num_aggregation_workers: Optional[int] = None, + grad_clip: Optional[float] = None, + opt_type: Optional[str] = None, + lr_schedule: Optional[List[List[Union[int, float]]]] = None, + decay: Optional[float] = None, + momentum: Optional[float] = None, + epsilon: Optional[float] = None, + vf_loss_coeff: Optional[float] = None, + entropy_coeff: Optional[float] = None, + entropy_coeff_schedule: Optional[List[List[Union[int, float]]]] = None, + _separate_vf_optimizer: Optional[bool] = None, + _lr_vf: Optional[float] = None, + after_train_step: Optional[Callable[[dict], None]] = None, + **kwargs, + ) -> "ImpalaConfig": + """Sets the training related configuration. + + Args: + vtrace: V-trace params (see vtrace_tf/torch.py). + vtrace_clip_rho_threshold: + vtrace_clip_pg_rho_threshold: + vtrace_drop_last_ts: If True, drop the last timestep for the vtrace + calculations, such that all data goes into the calculations as [B x T-1] + (+ the bootstrap value). This is the default and legacy RLlib behavior, + however, could potentially have a destabilizing effect on learning, + especially in sparse reward or reward-at-goal environments. 
+ False for not dropping the last timestep. + System params. + num_multi_gpu_tower_stacks: For each stack of multi-GPU towers, how many + slots should we reserve for parallel data loading? Set this to >1 to + load data into GPUs in parallel. This will increase GPU memory usage + proportionally with the number of stacks. + Example: + 2 GPUs and `num_multi_gpu_tower_stacks=3`: + - One tower stack consists of 2 GPUs, each with a copy of the + model/graph. + - Each of the stacks will create 3 slots for batch data on each of its + GPUs, increasing memory requirements on each GPU by 3x. + - This enables us to preload data into these stacks while another stack + is performing gradient calculations. + minibatch_buffer_size: How many train batches should be retained for + minibatching. This conf only has an effect if `num_sgd_iter > 1`. + num_sgd_iter: Number of passes to make over each train batch. + replay_proportion: Set >0 to enable experience replay. Saved samples will + be replayed with a p:1 proportion to new data samples. + replay_buffer_num_slots: Number of sample batches to store for replay. + The number of transitions saved total will be + (replay_buffer_num_slots * rollout_fragment_length). + learner_queue_size: Max queue size for train batches feeding into the + learner. + learner_queue_timeout: Wait for train batches to be available in minibatch + buffer queue this many seconds. This may need to be increased e.g. when + training with a slow environment. + max_sample_requests_in_flight_per_worker: Level of queuing for sampling. + broadcast_interval: Max number of workers to broadcast one set of + weights to. + + num_aggregation_workers: Use n (`num_aggregation_workers`) extra Actors for + multi-level aggregation of the data produced by the m RolloutWorkers + (`num_workers`). Note that n should be much smaller than m. + This can make sense if ingesting >2GB/s of samples, or if + the data requires decompression. + grad_clip: + opt_type: Either "adam" or "rmsprop". + lr_schedule: + + decay: `opt_type=rmsprop` settings. + momentum: + epsilon: + + vf_loss_coeff: Coefficient for the value function term in the loss function. + entropy_coeff: Coefficient for the entropy regularizer term in the loss + function. + entropy_coeff_schedule: + _separate_vf_optimizer: Set this to true to have two separate optimizers + optimize the policy-and value networks. + _lr_vf: If _separate_vf_optimizer is True, define separate learning rate + for the value network. + after_train_step: Callback for APPO to use to update KL, target network + periodically. The input to the callback is the learner fetches dict. + + Returns: + This updated TrainerConfig object. + """ + # Pass kwargs onto super's `training()` method. 
+ super().training(**kwargs) + + if vtrace is not None: + self.vtrace = vtrace + if vtrace_clip_rho_threshold is not None: + self.vtrace_clip_rho_threshold = vtrace_clip_rho_threshold + if vtrace_clip_pg_rho_threshold is not None: + self.vtrace_clip_pg_rho_threshold = vtrace_clip_pg_rho_threshold + if vtrace_drop_last_ts is not None: + self.vtrace_drop_last_ts = vtrace_drop_last_ts + if num_multi_gpu_tower_stacks is not None: + self.num_multi_gpu_tower_stacks = num_multi_gpu_tower_stacks + if minibatch_buffer_size is not None: + self.minibatch_buffer_size = minibatch_buffer_size + if num_sgd_iter is not None: + self.num_sgd_iter = num_sgd_iter + if replay_proportion is not None: + self.replay_proportion = replay_proportion + if replay_buffer_num_slots is not None: + self.replay_buffer_num_slots = replay_buffer_num_slots + if learner_queue_size is not None: + self.learner_queue_size = learner_queue_size + if learner_queue_timeout is not None: + self.learner_queue_timeout = learner_queue_timeout + if max_sample_requests_in_flight_per_worker is not None: + self.max_sample_requests_in_flight_per_worker = ( + max_sample_requests_in_flight_per_worker + ) + if broadcast_interval is not None: + self.broadcast_interval = broadcast_interval + if num_aggregation_workers is not None: + self.num_aggregation_workers = num_aggregation_workers + if grad_clip is not None: + self.grad_clip = grad_clip + if opt_type is not None: + self.opt_type = opt_type + if lr_schedule is not None: + self.lr_schedule = lr_schedule + if decay is not None: + self.decay = decay + if momentum is not None: + self.momentum = momentum + if epsilon is not None: + self.epsilon = epsilon + if vf_loss_coeff is not None: + self.vf_loss_coeff = vf_loss_coeff + if entropy_coeff is not None: + self.entropy_coeff = entropy_coeff + if entropy_coeff_schedule is not None: + self.entropy_coeff_schedule = entropy_coeff_schedule + if _separate_vf_optimizer is not None: + self._separate_vf_optimizer = _separate_vf_optimizer + if _lr_vf is not None: + self._lr_vf = _lr_vf + if after_train_step is not None: + self.after_train_step = after_train_step + + return self def make_learner_thread(local_worker, config): @@ -227,10 +367,23 @@ def __call__(self, item): class ImpalaTrainer(Trainer): + """Importance weighted actor/learner architecture (IMPALA) Trainer + + == Overview of data flow in IMPALA == + 1. Policy evaluation in parallel across `num_workers` actors produces + batches of size `rollout_fragment_length * num_envs_per_worker`. + 2. If enabled, the replay buffer stores and produces batches of size + `rollout_fragment_length * num_envs_per_worker`. + 3. If enabled, the minibatch ring buffer stores and replays batches of + size `train_batch_size` up to `num_sgd_iter` times per batch. + 4. The learner thread executes data parallel SGD across `num_gpus` GPUs + on batches of size `train_batch_size`. + """ + @classmethod @override(Trainer) def get_default_config(cls) -> TrainerConfigDict: - return DEFAULT_CONFIG + return ImpalaConfig().to_dict() @override(Trainer) def get_default_policy_class( @@ -412,3 +565,20 @@ def default_resource_request(cls, config): ), strategy=config.get("placement_strategy", "PACK"), ) + + +# Deprecated: Use ray.rllib.agents.pg.PGConfig instead! 
+class _deprecated_default_config(dict): + def __init__(self): + super().__init__(ImpalaConfig().to_dict()) + + @Deprecated( + old="ray.rllib.agents.impala.default_config::DEFAULT_CONFIG", + new="ray.rllib.agents.impala.impala.IMPALAConfig(...)", + error=False, + ) + def __getitem__(self, item): + return super().__getitem__(item) + + +DEFAULT_CONFIG = _deprecated_default_config() diff --git a/rllib/agents/impala/tests/test_impala.py b/rllib/agents/impala/tests/test_impala.py index 4b9790fa7cb5..6f3c18b2964a 100644 --- a/rllib/agents/impala/tests/test_impala.py +++ b/rllib/agents/impala/tests/test_impala.py @@ -26,26 +26,32 @@ def tearDownClass(cls) -> None: def test_impala_compilation(self): """Test whether an ImpalaTrainer can be built with both frameworks.""" - config = impala.DEFAULT_CONFIG.copy() - config["num_gpus"] = 0 - config["model"]["lstm_use_prev_action"] = True - config["model"]["lstm_use_prev_reward"] = True + config = ( + impala.ImpalaConfig() + .resources(num_gpus=0) + .training( + model={ + "lstm_use_prev_action": True, + "lstm_use_prev_reward": True, + } + ) + ) + num_iterations = 1 env = "CartPole-v0" for _ in framework_iterator(config, with_eager_tracing=True): - local_cfg = config.copy() for lstm in [False, True]: - local_cfg["num_aggregation_workers"] = 0 if not lstm else 1 - local_cfg["model"]["use_lstm"] = lstm + config.num_aggregation_workers = 0 if not lstm else 1 + config.model["use_lstm"] = lstm print( "lstm={} aggregation-workers={}".format( - lstm, local_cfg["num_aggregation_workers"] + lstm, config.num_aggregation_workers ) ) # Test with and w/o aggregation workers (this has nothing # to do with LSTMs, though). - trainer = impala.ImpalaTrainer(config=local_cfg, env=env) + trainer = config.build(env=env) for i in range(num_iterations): results = trainer.train() check_train_results(results) @@ -59,17 +65,20 @@ def test_impala_compilation(self): trainer.stop() def test_impala_lr_schedule(self): - config = impala.DEFAULT_CONFIG.copy() - config["num_gpus"] = 0 # Test whether we correctly ignore the "lr" setting. # The first lr should be 0.05. - config["lr"] = 0.1 - config["lr_schedule"] = [ - [0, 0.05], - [10000, 0.000001], - ] - config["num_gpus"] = 0 # Do not use any (fake) GPUs. 
- config["env"] = "CartPole-v0" + config = ( + impala.ImpalaConfig() + .resources(num_gpus=0) + .training( + lr=0.1, + lr_schedule=[ + [0, 0.05], + [10000, 0.000001], + ], + ) + ) + config.environment(env="CartPole-v0") def get_lr(result): return result["info"][LEARNER_INFO][DEFAULT_POLICY_ID][LEARNER_STATS_KEY][ @@ -77,7 +86,7 @@ def get_lr(result): ] for fw in framework_iterator(config): - trainer = impala.ImpalaTrainer(config=config) + trainer = config.build() policy = trainer.get_policy() try: From da26714a5d408416a80c8bae37ad1ae11eecf203 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Sun, 1 May 2022 22:14:52 +0200 Subject: [PATCH 2/5] wip --- rllib/__init__.py | 2 +- rllib/agents/impala/impala.py | 4 +- rllib/agents/ppo/__init__.py | 3 +- rllib/agents/ppo/appo.py | 210 ++++++++++++++++++++-------- rllib/agents/ppo/tests/test_appo.py | 60 ++++---- 5 files changed, 178 insertions(+), 101 deletions(-) diff --git a/rllib/__init__.py b/rllib/__init__.py index a8867c09bf1d..1ac2ece81b4e 100644 --- a/rllib/__init__.py +++ b/rllib/__init__.py @@ -57,7 +57,7 @@ def setup(self, config): _setup_logger() -usage_lib.record_library_usage("rllib") +# usage_lib.record_library_usage("rllib") __all__ = [ "Policy", diff --git a/rllib/agents/impala/impala.py b/rllib/agents/impala/impala.py index a1af09ad3275..6e500714b021 100644 --- a/rllib/agents/impala/impala.py +++ b/rllib/agents/impala/impala.py @@ -60,9 +60,9 @@ class ImpalaConfig(TrainerConfig): ... ) """ - def __init__(self): + def __init__(self, trainer_class=None): """Initializes a ImpalaConfig instance.""" - super().__init__(trainer_class=ImpalaTrainer) + super().__init__(trainer_class=trainer_class or ImpalaTrainer) # fmt: off # __sphinx_doc_begin__ diff --git a/rllib/agents/ppo/__init__.py b/rllib/agents/ppo/__init__.py index dca9f385fde0..3a8c0a20e486 100644 --- a/rllib/agents/ppo/__init__.py +++ b/rllib/agents/ppo/__init__.py @@ -1,10 +1,11 @@ from ray.rllib.agents.ppo.ppo import PPOConfig, PPOTrainer, DEFAULT_CONFIG from ray.rllib.agents.ppo.ppo_tf_policy import PPOTFPolicy from ray.rllib.agents.ppo.ppo_torch_policy import PPOTorchPolicy -from ray.rllib.agents.ppo.appo import APPOTrainer +from ray.rllib.agents.ppo.appo import APPOConfig, APPOTrainer from ray.rllib.agents.ppo.ddppo import DDPPOTrainer __all__ = [ + "APPOConfig", "APPOTrainer", "DDPPOTrainer", "DEFAULT_CONFIG", diff --git a/rllib/agents/ppo/appo.py b/rllib/agents/ppo/appo.py index 2ef0f9a88e26..c901c6d40b6d 100644 --- a/rllib/agents/ppo/appo.py +++ b/rllib/agents/ppo/appo.py @@ -23,69 +23,140 @@ _get_shared_metrics, ) from ray.rllib.utils.annotations import override +from ray.rllib.utils.deprecation import Deprecated from ray.rllib.utils.typing import PartialTrainerConfigDict, TrainerConfigDict -# fmt: off -# __sphinx_doc_begin__ - -# Adds the following updates to the `IMPALATrainer` config in -# rllib/agents/impala/impala.py. -DEFAULT_CONFIG = impala.ImpalaTrainer.merge_trainer_configs( - impala.DEFAULT_CONFIG, # See keys in impala.py, which are also supported. - { - # Whether to use V-trace weighted advantages. If false, PPO GAE - # advantages will be used instead. - "vtrace": True, - - # == These two options only apply if vtrace: False == - # Should use a critic as a baseline (otherwise don't use value - # baseline; required for using GAE). - "use_critic": True, - # If true, use the Generalized Advantage Estimator (GAE) - # with a value function, see https://arxiv.org/pdf/1506.02438.pdf. 
- "use_gae": True, - # GAE(lambda) parameter - "lambda": 1.0, - - # == PPO surrogate loss options == - "clip_param": 0.4, - - # == PPO KL Loss options == - "use_kl_loss": False, - "kl_coeff": 1.0, - "kl_target": 0.01, - - # == IMPALA optimizer params (see documentation in impala.py) == - "rollout_fragment_length": 50, - "train_batch_size": 500, - "min_time_s_per_reporting": 10, - "num_workers": 2, - "num_gpus": 0, - "num_multi_gpu_tower_stacks": 1, - "minibatch_buffer_size": 1, - "num_sgd_iter": 1, - "replay_proportion": 0.0, - "replay_buffer_num_slots": 100, - "learner_queue_size": 16, - "learner_queue_timeout": 300, - "max_sample_requests_in_flight_per_worker": 2, - "broadcast_interval": 1, - "grad_clip": 40.0, - "opt_type": "adam", - "lr": 0.0005, - "lr_schedule": None, - "decay": 0.99, - "momentum": 0.0, - "epsilon": 0.1, - "vf_loss_coeff": 0.5, - "entropy_coeff": 0.01, - "entropy_coeff_schedule": None, - }, - _allow_unknown_configs=True, -) -# __sphinx_doc_end__ -# fmt: on +class APPOConfig(impala.ImpalaConfig): + """Defines a A2CTrainer configuration class from which a new Trainer can be built. + + Example: + >>> from ray import tune + >>> config = APPOConfig().training(lr=0.01, grad_clip=30.0)\ + ... .resources(num_gpus=1)\ + ... .rollouts(num_rollout_workers=16) + >>> print(config.to_dict()) + >>> # Build a Trainer object from the config and run 1 training iteration. + >>> trainer = config.build(env="CartPole-v1") + >>> trainer.train() + + Example: + >>> config = APPOConfig() + >>> # Print out some default values. + >>> print(config.sample_async) + >>> # Update the config object. + >>> config.training(lr=tune.grid_search([0.001, 0.0001])) + >>> # Set the config object's env. + >>> config.environment(env="CartPole-v1") + >>> # Use to_dict() to get the old-style python config dict + >>> # when running with tune. + >>> tune.run( + ... "APPO", + ... stop={"episode_reward_mean": 200}, + ... config=config.to_dict(), + ... ) + """ + + def __init__(self, trainer_class=None): + """Initializes a APPOConfig instance.""" + super().__init__(trainer_class=trainer_class or APPOTrainer) + + # fmt: off + # __sphinx_doc_begin__ + + # APPO specific settings: + self.vtrace = True + self.use_critic = True + self.use_gae = True + self.lambda_ = 1.0 + self.clip_param = 0.4 + self.use_kl_loss = False + self.kl_coeff = 1.0 + self.kl_target = 0.01 + + # Override some of ImpalaConfig's default values with APPO-specific values. 
+ self.rollout_fragment_length = 50 + self.train_batch_size = 500 + self.min_time_s_per_reporting = 10 + self.num_workers = 2 + self.num_gpus = 0 + self.num_multi_gpu_tower_stacks = 1 + self.minibatch_buffer_size = 1 + self.num_sgd_iter = 1 + self.replay_proportion = 0.0 + self.replay_buffer_num_slots = 100 + self.learner_queue_size = 16 + self.learner_queue_timeout = 300 + self.max_sample_requests_in_flight_per_worker = 2 + self.broadcast_interval = 1 + self.grad_clip = 40.0 + self.opt_type = "adam" + self.lr = 0.0005 + self.lr_schedule = None + self.decay = 0.99 + self.momentum = 0.0 + self.epsilon = 0.1 + self.vf_loss_coeff = 0.5 + self.entropy_coeff = 0.01 + self.entropy_coeff_schedule = None + # __sphinx_doc_end__ + # fmt: on + + @override(impala.ImpalaConfig) + def training( + self, + *, + vtrace: Optional[bool] = None, + use_critic: Optional[bool] = None, + use_gae: Optional[bool] = None, + lambda_: Optional[float] = None, + clip_param: Optional[float] = None, + use_kl_loss: Optional[bool] = None, + kl_coeff: Optional[float] = None, + kl_target: Optional[float] = None, + **kwargs, + ) -> "APPOConfig": + """Sets the training related configuration. + + Args: + vtrace: Whether to use V-trace weighted advantages. If false, PPO GAE + advantages will be used instead. + use_critic: Should use a critic as a baseline (otherwise don't use value + baseline; required for using GAE). Only applies if vtrace=False. + use_gae: If true, use the Generalized Advantage Estimator (GAE) + with a value function, see https://arxiv.org/pdf/1506.02438.pdf. + Only applies if vtrace=False. + lambda_: GAE (lambda) parameter. + clip_param: PPO surrogate slipping parameter. + use_kl_loss: Whether to use the KL-term in the loss function. + kl_coeff: Coefficient for weighting the KL-loss term. + kl_target: Target term for the KL-term to reach (via adjusting the + `kl_coeff` automatically). + + Returns: + This updated TrainerConfig object. + """ + # Pass kwargs onto super's `training()` method. + super().training(**kwargs) + + if vtrace is not None: + self.vtrace = vtrace + if use_critic is not None: + self.use_critic = use_critic + if use_gae is not None: + self.use_gae = use_gae + if lambda_ is not None: + self.lambda_ = lambda_ + if clip_param is not None: + self.clip_param = clip_param + if use_kl_loss is not None: + self.use_kl_loss = use_kl_loss + if kl_coeff is not None: + self.kl_coeff = kl_coeff + if kl_target is not None: + self.kl_target = kl_target + + return self class UpdateTargetAndKL: @@ -130,7 +201,7 @@ def __init__(self, config, *args, **kwargs): @classmethod @override(Trainer) def get_default_config(cls) -> TrainerConfigDict: - return DEFAULT_CONFIG + return APPOConfig().to_dict() @override(Trainer) def get_default_policy_class( @@ -142,3 +213,20 @@ def get_default_policy_class( return AsyncPPOTorchPolicy else: return AsyncPPOTFPolicy + + +# Deprecated: Use ray.rllib.agents.a3c.A3CConfig instead! 
+class _deprecated_default_config(dict): + def __init__(self): + super().__init__(APPOConfig().to_dict()) + + @Deprecated( + old="ray.rllib.agents.ppo.appo.DEFAULT_CONFIG", + new="ray.rllib.agents.ppo.appo.APPOConfig(...)", + error=False, + ) + def __getitem__(self, item): + return super().__getitem__(item) + + +DEFAULT_CONFIG = _deprecated_default_config() diff --git a/rllib/agents/ppo/tests/test_appo.py b/rllib/agents/ppo/tests/test_appo.py index 551a45b5788d..f96970a1e16d 100644 --- a/rllib/agents/ppo/tests/test_appo.py +++ b/rllib/agents/ppo/tests/test_appo.py @@ -22,15 +22,13 @@ def tearDownClass(cls): def test_appo_compilation(self): """Test whether an APPOTrainer can be built with both frameworks.""" - config = ppo.appo.DEFAULT_CONFIG.copy() - config["num_workers"] = 1 + config = ppo.appo.APPOConfig().rollouts(num_rollout_workers=1) num_iterations = 2 for _ in framework_iterator(config, with_eager_tracing=True): print("w/o v-trace") - _config = config.copy() - _config["vtrace"] = False - trainer = ppo.APPOTrainer(config=_config, env="CartPole-v0") + config.vtrace = False + trainer = config.build(env="CartPole-v0") for i in range(num_iterations): results = trainer.train() check_train_results(results) @@ -39,9 +37,8 @@ def test_appo_compilation(self): trainer.stop() print("w/ v-trace") - _config = config.copy() - _config["vtrace"] = True - trainer = ppo.APPOTrainer(config=_config, env="CartPole-v0") + config.vtrace = True + trainer = config.build(env="CartPole-v0") for i in range(num_iterations): results = trainer.train() check_train_results(results) @@ -51,13 +48,11 @@ def test_appo_compilation(self): def test_appo_compilation_use_kl_loss(self): """Test whether an APPOTrainer can be built with kl_loss enabled.""" - config = ppo.appo.DEFAULT_CONFIG.copy() - config["num_workers"] = 1 - config["use_kl_loss"] = True + config = ppo.appo.APPOConfig().rollouts(num_rollout_workers=1).training(use_kl_loss=True) num_iterations = 2 for _ in framework_iterator(config, with_eager_tracing=True): - trainer = ppo.APPOTrainer(config=config, env="CartPole-v0") + trainer = config.build(env="CartPole-v0") for i in range(num_iterations): results = trainer.train() check_train_results(results) @@ -66,22 +61,19 @@ def test_appo_compilation_use_kl_loss(self): trainer.stop() def test_appo_two_tf_optimizers(self): - config = ppo.appo.DEFAULT_CONFIG.copy() - config["num_workers"] = 1 - # Not explicitly setting this should cause a warning, but not fail. # config["_tf_policy_handles_more_than_one_loss"] = True - config["_separate_vf_optimizer"] = True - config["_lr_vf"] = 0.0002 - + config = ppo.appo.APPOConfig().rollouts(num_rollout_workers=1).training( + _separate_vf_optimizer=True, _lr_vf=0.002) # Make sure we have two completely separate models for policy and # value function. - config["model"]["vf_share_layers"] = False + config.model["vf_share_layers"] = False + num_iterations = 2 # Only supported for tf so far. 
for _ in framework_iterator(config, frameworks=("tf2", "tf")): - trainer = ppo.APPOTrainer(config=config, env="CartPole-v0") + trainer = config.build(env="CartPole-v0") for i in range(num_iterations): results = trainer.train() check_train_results(results) @@ -90,23 +82,19 @@ def test_appo_two_tf_optimizers(self): trainer.stop() def test_appo_entropy_coeff_schedule(self): - config = ppo.appo.DEFAULT_CONFIG.copy() - config["num_workers"] = 1 - config["num_gpus"] = 0 - config["train_batch_size"] = 20 - config["batch_mode"] = "truncate_episodes" - config["rollout_fragment_length"] = 10 - config["timesteps_per_iteration"] = 20 + # Initial lr, doesn't really matter because of the schedule below. + config = ppo.appo.APPOConfig().\ + rollouts(num_rollout_workers=1, batch_mode="truncate_episodes", rollout_fragment_length=10).\ + resources(num_gpus=0).\ + training(train_batch_size=20, entropy_coeff=0.01, entropy_coeff_schedule=[ + [0, 0.01], + [120, 0.0001], + ]) + + config.min_sample_timesteps_per_reporting = 20 # 0 metrics reporting delay, this makes sure timestep, # which entropy coeff depends on, is updated after each worker rollout. - config["min_time_s_per_reporting"] = 0 - # Initial lr, doesn't really matter because of the schedule below. - config["entropy_coeff"] = 0.01 - schedule = [ - [0, 0.01], - [120, 0.0001], - ] - config["entropy_coeff_schedule"] = schedule + config.min_time_s_per_reporting = 0 def _step_n_times(trainer, n: int): """Step trainer n times. @@ -121,7 +109,7 @@ def _step_n_times(trainer, n: int): ] for _ in framework_iterator(config): - trainer = ppo.APPOTrainer(config=config, env="CartPole-v0") + trainer = config.build(env="CartPole-v0") coeff = _step_n_times(trainer, 1) # 20 timesteps # Should be close to the starting coeff of 0.01. From 21bf876640cc9d2c52e740cb959bce12444269a0 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Mon, 2 May 2022 10:25:23 +0200 Subject: [PATCH 3/5] wip --- rllib/agents/impala/impala.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/rllib/agents/impala/impala.py b/rllib/agents/impala/impala.py index a1af09ad3275..5c0072ac29d1 100644 --- a/rllib/agents/impala/impala.py +++ b/rllib/agents/impala/impala.py @@ -34,6 +34,7 @@ class ImpalaConfig(TrainerConfig): """Defines an ARSTrainer configuration class from which an ImpalaTrainer can be built. Example: + >>> from ray.rllib.agents.impala import ImpalaConfig >>> config = ImpalaConfig().training(lr=0.0003, train_batch_size=512)\ ... .resources(num_gpus=4)\ ... .rollouts(num_rollout_workers=64) @@ -43,6 +44,7 @@ class ImpalaConfig(TrainerConfig): >>> trainer.train() Example: + >>> from ray.rllib.agents.impala import ImpalaConfig >>> from ray import tune >>> config = ImpalaConfig() >>> # Print out some default values. From c629956fbfaba88319dd9ca31ee9f27cd72cd916 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Mon, 2 May 2022 10:45:21 +0200 Subject: [PATCH 4/5] wip --- rllib/agents/ppo/appo.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/rllib/agents/ppo/appo.py b/rllib/agents/ppo/appo.py index c901c6d40b6d..eae0f065d2c4 100644 --- a/rllib/agents/ppo/appo.py +++ b/rllib/agents/ppo/appo.py @@ -28,10 +28,10 @@ class APPOConfig(impala.ImpalaConfig): - """Defines a A2CTrainer configuration class from which a new Trainer can be built. + """Defines a APPOTrainer configuration class from which a new Trainer can be built. Example: - >>> from ray import tune + >>> from ray.rllib.agents.ppo import APPOConfig >>> config = APPOConfig().training(lr=0.01, grad_clip=30.0)\ ... 
.resources(num_gpus=1)\ ... .rollouts(num_rollout_workers=16) @@ -41,6 +41,8 @@ class APPOConfig(impala.ImpalaConfig): >>> trainer.train() Example: + >>> from ray.rllib.agents.ppo import APPOConfig + >>> from ray import tune >>> config = APPOConfig() >>> # Print out some default values. >>> print(config.sample_async) @@ -215,7 +217,7 @@ def get_default_policy_class( return AsyncPPOTFPolicy -# Deprecated: Use ray.rllib.agents.a3c.A3CConfig instead! +# Deprecated: Use ray.rllib.agents.ppo.APPOConfig instead! class _deprecated_default_config(dict): def __init__(self): super().__init__(APPOConfig().to_dict()) From 7b17862d919b3868268b03beebf2427550b8090f Mon Sep 17 00:00:00 2001 From: sven1977 Date: Mon, 2 May 2022 12:11:25 +0200 Subject: [PATCH 5/5] wip --- rllib/__init__.py | 2 +- rllib/agents/ppo/tests/test_appo.py | 37 +++++++++++++++++++++-------- 2 files changed, 28 insertions(+), 11 deletions(-) diff --git a/rllib/__init__.py b/rllib/__init__.py index 1ac2ece81b4e..a8867c09bf1d 100644 --- a/rllib/__init__.py +++ b/rllib/__init__.py @@ -57,7 +57,7 @@ def setup(self, config): _setup_logger() -# usage_lib.record_library_usage("rllib") +usage_lib.record_library_usage("rllib") __all__ = [ "Policy", diff --git a/rllib/agents/ppo/tests/test_appo.py b/rllib/agents/ppo/tests/test_appo.py index f96970a1e16d..1ddabc9f13ab 100644 --- a/rllib/agents/ppo/tests/test_appo.py +++ b/rllib/agents/ppo/tests/test_appo.py @@ -48,7 +48,11 @@ def test_appo_compilation(self): def test_appo_compilation_use_kl_loss(self): """Test whether an APPOTrainer can be built with kl_loss enabled.""" - config = ppo.appo.APPOConfig().rollouts(num_rollout_workers=1).training(use_kl_loss=True) + config = ( + ppo.appo.APPOConfig() + .rollouts(num_rollout_workers=1) + .training(use_kl_loss=True) + ) num_iterations = 2 for _ in framework_iterator(config, with_eager_tracing=True): @@ -63,8 +67,11 @@ def test_appo_compilation_use_kl_loss(self): def test_appo_two_tf_optimizers(self): # Not explicitly setting this should cause a warning, but not fail. # config["_tf_policy_handles_more_than_one_loss"] = True - config = ppo.appo.APPOConfig().rollouts(num_rollout_workers=1).training( - _separate_vf_optimizer=True, _lr_vf=0.002) + config = ( + ppo.appo.APPOConfig() + .rollouts(num_rollout_workers=1) + .training(_separate_vf_optimizer=True, _lr_vf=0.002) + ) # Make sure we have two completely separate models for policy and # value function. config.model["vf_share_layers"] = False @@ -83,13 +90,23 @@ def test_appo_two_tf_optimizers(self): def test_appo_entropy_coeff_schedule(self): # Initial lr, doesn't really matter because of the schedule below. - config = ppo.appo.APPOConfig().\ - rollouts(num_rollout_workers=1, batch_mode="truncate_episodes", rollout_fragment_length=10).\ - resources(num_gpus=0).\ - training(train_batch_size=20, entropy_coeff=0.01, entropy_coeff_schedule=[ - [0, 0.01], - [120, 0.0001], - ]) + config = ( + ppo.appo.APPOConfig() + .rollouts( + num_rollout_workers=1, + batch_mode="truncate_episodes", + rollout_fragment_length=10, + ) + .resources(num_gpus=0) + .training( + train_batch_size=20, + entropy_coeff=0.01, + entropy_coeff_schedule=[ + [0, 0.01], + [120, 0.0001], + ], + ) + ) config.min_sample_timesteps_per_reporting = 20 # 0 metrics reporting delay, this makes sure timestep,
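
A minimal end-to-end sketch of the builder-style config API these commits introduce, assembled from the docstring and test examples in the diff above. The module paths, the method names (training, resources, rollouts, environment, build, to_dict), and the environment names are taken from the patch itself; the concrete hyperparameter values are illustrative only, and the snippet assumes the Ray version these commits target.

from ray import tune
from ray.rllib.agents.impala import ImpalaConfig
from ray.rllib.agents.ppo import APPOConfig

# New style: chain the setter methods, then build a Trainer directly ...
config = (
    ImpalaConfig()
    .training(lr=0.0003, train_batch_size=512)
    .resources(num_gpus=0)
    .rollouts(num_rollout_workers=2)
)
trainer = config.build(env="CartPole-v1")
trainer.train()
trainer.stop()

# ... or convert to the old-style dict when running through tune.
# APPOConfig inherits all IMPALA keys and works the same way.
appo_config = APPOConfig().rollouts(num_rollout_workers=1).training(use_kl_loss=True)
appo_config.environment(env="CartPole-v1")
tune.run(
    "APPO",
    stop={"episode_reward_mean": 200},
    config=appo_config.to_dict(),
)

# The old DEFAULT_CONFIG dicts still exist, but key access now goes through the
# _deprecated_default_config shim added above and emits a deprecation warning
# (error=False), pointing users at ImpalaConfig / APPOConfig instead.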