[RLlib] Make the KL coefficient traced in appo tf (#34293)
Signed-off-by: Avnish <[email protected]>
avnishn authored Apr 13, 2023
1 parent 10d0b8a commit f1b14d2
Showing 3 changed files with 56 additions and 6 deletions.
47 changes: 45 additions & 2 deletions rllib/algorithms/appo/tests/tf/test_appo_learner.py
@@ -3,9 +3,13 @@
 
 import ray
 import ray.rllib.algorithms.appo as appo
+from ray.rllib.algorithms.appo.tf.appo_tf_learner import (
+    LEARNER_RESULTS_CURR_KL_COEFF_KEY,
+)
 from ray.rllib.core.rl_module.rl_module import SingleAgentRLModuleSpec
-from ray.rllib.policy.sample_batch import SampleBatch
+from ray.rllib.policy.sample_batch import SampleBatch, DEFAULT_POLICY_ID
 from ray.rllib.utils.metrics import ALL_MODULES
+from ray.rllib.utils.metrics.learner_info import LEARNER_INFO, LEARNER_STATS_KEY
 from ray.rllib.utils.framework import try_import_tf
 from ray.rllib.utils.test_utils import check, framework_iterator
 
@@ -36,7 +40,7 @@
 }
 
 
-class TestImpalaTfLearner(unittest.TestCase):
+class TestAPPOTfLearner(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
         ray.init()
@@ -105,6 +109,45 @@ def test_appo_loss(self):
 
         check(learner_group_loss, policy_loss)
 
+    def test_kl_coeff_changes(self):
+        initial_kl_coeff = 0.01
+        config = (
+            appo.APPOConfig()
+            .environment("CartPole-v1")
+            .rollouts(
+                num_rollout_workers=0,
+                rollout_fragment_length=frag_length,
+            )
+            .resources(num_gpus=0)
+            .framework(eager_tracing=True)
+            .training(
+                gamma=0.99,
+                model=dict(
+                    fcnet_hiddens=[10, 10],
+                    fcnet_activation="linear",
+                    vf_share_layers=False,
+                ),
+                _enable_learner_api=True,
+                kl_coeff=initial_kl_coeff,
+            )
+            .rl_module(
+                _enable_rl_module_api=True,
+            )
+            .exploration(exploration_config={})
+        )
+        for _ in framework_iterator(config, ("tf2",)):
+            algo = config.build()
+            # Keep calling train() until results come back: APPO is an
+            # asynchronous algorithm, so early calls may return nothing.
+            while True:
+                results = algo.train()
+                if results:
+                    break
+            curr_kl_coeff = results["info"][LEARNER_INFO][DEFAULT_POLICY_ID][
+                LEARNER_STATS_KEY
+            ][LEARNER_RESULTS_CURR_KL_COEFF_KEY]
+            self.assertNotEqual(curr_kl_coeff, initial_kl_coeff)
+
 
 if __name__ == "__main__":
     import pytest
13 changes: 9 additions & 4 deletions rllib/algorithms/appo/tf/appo_tf_learner.py
@@ -18,6 +18,7 @@
 
 
 LEARNER_RESULTS_KL_KEY = "mean_kl_loss"
+LEARNER_RESULTS_CURR_KL_COEFF_KEY = "curr_kl_coeff"
 
 
 @dataclass
@@ -72,8 +73,11 @@ def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.kl_target = self._hps.kl_target
         self.clip_param = self._hps.clip_param
-        self.kl_coeffs = defaultdict(lambda: self._hps.kl_coeff)
-        self.kl_coeff = self._hps.kl_coeff
+        # TODO: (avnishn) Make creating the kl coeff a utility function when we add
+        # torch APPO as well.
+        self.kl_coeffs = defaultdict(
+            lambda: tf.Variable(self._hps.kl_coeff, trainable=False, dtype=tf.float32)
+        )
         self.tau = self._hps.tau
 
     @override(TfLearner)
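Why this change matters: with eager_tracing=True, the learner's loss and update paths are compiled via tf.function. A plain Python float (the old defaultdict default) is baked into the traced graph as a constant, so the multiplicative updates applied later in _update_module_kl_coeff would never reach the compiled loss. A non-trainable tf.Variable, mutated in place with assign(), is read fresh on every traced call. A minimal standalone sketch of the difference (toy code, not RLlib code):

import tensorflow as tf

py_coeff = 0.01  # Plain float: frozen into the graph at trace time.
var_coeff = tf.Variable(0.01, trainable=False, dtype=tf.float32)

@tf.function
def loss_with_float(kl):
    return py_coeff * kl

@tf.function
def loss_with_variable(kl):
    return var_coeff * kl

kl = tf.constant(1.0)
loss_with_float(kl)     # Traces; captures 0.01 as a constant.
loss_with_variable(kl)  # Traces; reads the variable at call time.

# "Update" the coefficient the way the KL controller does.
py_coeff *= 1.5
var_coeff.assign(var_coeff * 1.5)

print(loss_with_float(kl).numpy())     # Still 0.01 -- the update is invisible.
print(loss_with_variable(kl).numpy())  # 0.015 -- the update took effect.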
@@ -192,6 +196,7 @@ def compute_loss_per_module(
             VF_LOSS_KEY: mean_vf_loss,
             ENTROPY_KEY: mean_entropy_loss,
             LEARNER_RESULTS_KL_KEY: mean_kl_loss,
+            LEARNER_RESULTS_CURR_KL_COEFF_KEY: self.kl_coeffs[module_id],
         }
 
     @override(ImpalaTfLearner)
@@ -238,10 +243,10 @@ def _update_module_kl_coeff(
         # Update the current KL value based on the recently measured value.
         # Increase.
         if sampled_kl > 2.0 * self.kl_target:
-            self.kl_coeffs[module_id] *= 1.5
+            self.kl_coeffs[module_id].assign(self.kl_coeffs[module_id] * 1.5)
         # Decrease.
         elif sampled_kl < 0.5 * self.kl_target:
-            self.kl_coeffs[module_id] *= 0.5
+            self.kl_coeffs[module_id].assign(self.kl_coeffs[module_id] * 0.5)
 
     @override(ImpalaTfLearner)
     def additional_update_per_module(
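This is the standard PPO-style adaptive KL controller, now expressed with in-place assign() so it works on the traced variable: the penalty coefficient grows by 1.5x when the sampled KL drifts above twice the target, is halved when it drops below half the target, and is left untouched inside the [0.5, 2.0] * target band. A self-contained sketch of the rule (the thresholds and factors follow the diff; the names and toy loop are illustrative only):

import tensorflow as tf

kl_target = 0.01
kl_coeff = tf.Variable(0.01, trainable=False, dtype=tf.float32)

def update_kl_coeff(sampled_kl: float) -> None:
    if sampled_kl > 2.0 * kl_target:
        # KL too high: strengthen the penalty.
        kl_coeff.assign(kl_coeff * 1.5)
    elif sampled_kl < 0.5 * kl_target:
        # KL comfortably low: relax the penalty.
        kl_coeff.assign(kl_coeff * 0.5)
    # Otherwise leave the coefficient unchanged.

for sampled_kl in (0.05, 0.03, 0.001):
    update_kl_coeff(sampled_kl)
    print(float(kl_coeff.numpy()))  # ~0.015, ~0.0225, ~0.01125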
2 changes: 2 additions & 0 deletions rllib/tuned_examples/appo/cartpole-appo-learner.yaml
@@ -27,3 +27,5 @@ cartpole-appo-learner:
         eager_tracing: True
         lr: 0.001
         entropy_coeff: 0.1
+        kl_coeff: 0.01
+        exploration_config: null
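Here kl_coeff: 0.01 gives the adaptive controller a nonzero starting point, and exploration_config: null mirrors the exploration_config={} override in the test above. One hedged way to launch this tuned example from Python (assuming the Tune API of this Ray release and the usual env/run/stop/config layout of RLlib tuned-example YAMLs):

import yaml
from ray import air, tune

with open("rllib/tuned_examples/appo/cartpole-appo-learner.yaml") as f:
    # Tuned-example files have a single top-level experiment key.
    experiment = next(iter(yaml.safe_load(f).values()))

tuner = tune.Tuner(
    experiment["run"],  # Expected to be "APPO".
    param_space={"env": experiment["env"], **experiment["config"]},
    run_config=air.RunConfig(stop=experiment.get("stop", {})),
)
tuner.fit()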
