[RLlib] Add separate learning rates for policy and alpha to SAC. #47078

Merged
Commits
43 commits
ffe5048
Added separate learning rates for policy, critic, and alpha to SAC. T…
simonsays1980 Aug 12, 2024
3fc16cf
Merge branch 'master' into add-actor-specific-learning-rate
sven1977 Aug 12, 2024
cd5450c
Added an additional 'critic_lr', changed 'policy_lr' to 'actor_lr', an…
simonsays1980 Aug 13, 2024
38cac91
Merge branch 'master' into add-actor-specific-learning-rate
simonsays1980 Aug 13, 2024
8f64c7c
Merge branch 'master' into add-actor-specific-learning-rate
simonsays1980 Aug 13, 2024
15e2898
Apply suggestions from code review
sven1977 Aug 14, 2024
d0679b7
[RLlib; Offline RL] Implement twin-Q net option for CQL. (#47105)
simonsays1980 Aug 13, 2024
e15ef64
[core] remove unused GcsAio(Publisher|Subscriber) methods and subclas…
rynewang Aug 13, 2024
fc0f1fe
[Core] Fix a bug where we submit the actor creation task to the wrong…
jjyao Aug 13, 2024
387a083
[doc][build] Update all changed files timestamp to latest (#47115)
khluu Aug 13, 2024
326eaae
[serve] split `test_proxy.py` into unit and e2e tests (#47112)
zcin Aug 13, 2024
33d574a
[Utility] add `env_float` utility into `ray._private.ray_constants` (…
hongpeng-guo Aug 13, 2024
eff647b
[Data] Fix progress bars not showing % progress (#47120)
scottjlee Aug 13, 2024
ca98c7f
[data] change data17 to datal (#47082)
aslonnie Aug 14, 2024
530f511
[ci] change data build for all python versions to arrow 17 (#47121)
can-anyscale Aug 14, 2024
cbaad59
[doc][rllib] add missing public api references (#47111)
can-anyscale Aug 14, 2024
ce283ad
[Core] Clarify docstring for get_gpu_ids() that it is only called ins…
petern48 Aug 14, 2024
88031fe
[doc][rllib] the rest of missing api references + lint checker (#47114)
can-anyscale Aug 14, 2024
a137979
Light up Ask AI button When Search is Open (#47054)
cristianjd Aug 14, 2024
aac0a6e
[serve] immediately send ping in router when receiving new replica se…
zcin Aug 14, 2024
d560f3e
[data] Add label to indicate if operator is backpressured (#47095)
omatthew98 Aug 14, 2024
2626ea7
[Core] Add ray[adag] option to pip install (#47009)
ruisearch42 Aug 14, 2024
bb5c322
[Doc] Run pre-commit on tune docs (#47108)
peytondmurray Aug 14, 2024
7ae5621
[release tests] update anyscale service utils (#46397)
zcin Aug 14, 2024
2f77ddd
[core][experimental] Build an operation-based execution schedule for …
kevin85421 Aug 14, 2024
9c17e13
[serve] remove warnings about ongoing requests default change (#47085)
zcin Aug 14, 2024
7cc321a
[serve] `__init__` functions have no return values (#47144)
aslonnie Aug 15, 2024
8e6f9cc
Merge branch 'master' into add-actor-specific-learning-rate
simonsays1980 Aug 15, 2024
8d98825
Merge branch 'master' into add-actor-specific-learning-rate
simonsays1980 Aug 15, 2024
6808cbb
Turned off test 'self_play_with_policy_checkpoint' b/c it was failing…
simonsays1980 Aug 15, 2024
3a75d15
Uncommented 'pretrain_bc_single_agent_evaluate_as_multi_agent' b/c hy…
simonsays1980 Aug 16, 2024
f1dde3e
Merge branch 'master' into add-actor-specific-learning-rate
simonsays1980 Aug 16, 2024
39f90a6
Merge branch 'master' into add-actor-specific-learning-rate
simonsays1980 Aug 19, 2024
79f34ff
Switched test for old stack CQL to 'torch-only' b/c 'tf2' fails perma…
simonsays1980 Aug 19, 2024
3944b31
Fixed a small bug with uninitialized learning rates on old stack SAC.
simonsays1980 Aug 19, 2024
0a7aaa5
Merged master.
simonsays1980 Aug 20, 2024
3d670c5
Added actor- and critic-specific learning rates to HalfCheetah tests …
simonsays1980 Aug 20, 2024
380e8e6
Merge branch 'master' into add-actor-specific-learning-rate
simonsays1980 Aug 20, 2024
179fce4
Fixed error in 'test_worker_failures' due to the base 'lr' not set to…
simonsays1980 Aug 20, 2024
d6d4d5a
Fixed error in doc codes not implementing 'lr=None' and adapted learn…
simonsays1980 Aug 20, 2024
44336ae
Tuned learning rates on multi-agent SAC example.
simonsays1980 Aug 20, 2024
2dbb93d
Merge branch 'master' into add-actor-specific-learning-rate
simonsays1980 Aug 20, 2024
351b0a8
Added tuned learning rates to single agent SAC tuned example and Half…
simonsays1980 Aug 20, 2024
74 changes: 73 additions & 1 deletion rllib/algorithms/sac/sac.py
@@ -14,7 +14,7 @@
deprecation_warning,
)
from ray.rllib.utils.framework import try_import_tf, try_import_tfp
from ray.rllib.utils.typing import RLModuleSpecType, ResultDict
from ray.rllib.utils.typing import LearningRateOrSchedule, RLModuleSpecType, ResultDict

tf1, tf, tfv = try_import_tf()
tfp = try_import_tfp()
@@ -82,6 +82,11 @@ def __init__(self, algo_class=None):
"critic_learning_rate": 3e-4,
"entropy_learning_rate": 3e-4,
}
self.actor_lr = 3e-5
self.critic_lr = 3e-4
self.alpha_lr = 3e-4
# Set `lr` parameter to `None` and ensure it is not used.
self.lr = None
self.grad_clip = None
self.target_network_update_freq = 0

@@ -135,6 +140,9 @@ def training(
clip_actions: Optional[bool] = NotProvided,
grad_clip: Optional[float] = NotProvided,
optimization_config: Optional[Dict[str, Any]] = NotProvided,
actor_lr: Optional[LearningRateOrSchedule] = NotProvided,
critic_lr: Optional[LearningRateOrSchedule] = NotProvided,
alpha_lr: Optional[LearningRateOrSchedule] = NotProvided,
target_network_update_freq: Optional[int] = NotProvided,
_deterministic_loss: Optional[bool] = NotProvided,
_use_beta_distribution: Optional[bool] = NotProvided,
@@ -239,6 +247,56 @@ def training(
optimization_config: Config dict for optimization. Set the supported keys
`actor_learning_rate`, `critic_learning_rate`, and
`entropy_learning_rate` in here.
actor_lr: The learning rate (float) or learning rate schedule for the
policy in the format of
[[timestep, lr-value], [timestep, lr-value], ...]. In case of a
schedule, intermediary timesteps will be assigned to linearly
interpolated learning rate values. A schedule config's first entry
must start with timestep 0, i.e.: [[0, initial_value], [...]].
Note: It is common practice (two-timescale approach) to use a smaller
learning rate for the policy than for the critic to ensure that the
critic gives adequate values for improving the policy.
Note: If you require a) more than one optimizer (per RLModule),
b) optimizer types that are not Adam, c) a learning rate schedule that
is not a linearly interpolated, piecewise schedule as described above,
or d) specifying c'tor arguments of the optimizer that are not the
learning rate (e.g. Adam's epsilon), then you must override your
Learner's `configure_optimizers_for_module()` method and handle
lr-scheduling yourself.
The default value is 3e-5, one order of magnitude smaller than the
critic's learning rate (see `critic_lr`).
critic_lr: The learning rate (float) or learning rate schedule for the
critic in the format of
[[timestep, lr-value], [timestep, lr-value], ...]. In case of a
schedule, intermediary timesteps will be assigned to linearly
interpolated learning rate values. A schedule config's first entry
must start with timestep 0, i.e.: [[0, initial_value], [...]].
Note: It is common practice (two-timescale approach) to use a smaller
learning rate for the policy than for the critic to ensure that the
critic gives adequate values for improving the policy.
Note: If you require a) more than one optimizer (per RLModule),
b) optimizer types that are not Adam, c) a learning rate schedule that
is not a linearly interpolated, piecewise schedule as described above,
or d) specifying c'tor arguments of the optimizer that are not the
learning rate (e.g. Adam's epsilon), then you must override your
Learner's `configure_optimizers_for_module()` method and handle
lr-scheduling yourself.
The default value is 3e-4, one order of magnitude larger than the
actor (policy) learning rate (see `actor_lr`).
alpha_lr: The learning rate (float) or learning rate schedule for the
hyperparameter alpha in the format of
[[timestep, lr-value], [timestep, lr-value], ...]. In case of a
schedule, intermediary timesteps will be assigned to linearly
interpolated learning rate values. A schedule config's first entry
must start with timestep 0, i.e.: [[0, initial_value], [...]].
Note: If you require a) more than one optimizer (per RLModule),
b) optimizer types that are not Adam, c) a learning rate schedule that
is not a linearly interpolated, piecewise schedule as described above,
or d) specifying c'tor arguments of the optimizer that are not the
learning rate (e.g. Adam's epsilon), then you must override your
Learner's `configure_optimizers_for_module()` method and handle
lr-scheduling yourself.
The default value is 3e-4, identical to the critic learning rate (`critic_lr`).
target_network_update_freq: Update the target network every
`target_network_update_freq` steps.
_deterministic_loss: Whether the loss should be calculated deterministically
@@ -289,6 +347,12 @@ def training(
self.grad_clip = grad_clip
if optimization_config is not NotProvided:
self.optimization = optimization_config
if actor_lr is not NotProvided:
self.actor_lr = actor_lr
if critic_lr is not NotProvided:
self.critic_lr = critic_lr
if alpha_lr is not NotProvided:
self.alpha_lr = alpha_lr
if target_network_update_freq is not NotProvided:
self.target_network_update_freq = target_network_update_freq
if _deterministic_loss is not NotProvided:
@@ -362,6 +426,14 @@ def validate(self) -> None:
"`EpisodeReplayBuffer`."
)

if self.enable_rl_module_and_learner and self.lr:
raise ValueError(
"Basic learning rate parameter `lr` is not `None`. For SAC "
"use the specific learning rate parameters `actor_lr`, `critic_lr` "
"and `alpha_lr`, for the actor, critic, and the hyperparameter "
"`alpha`, respectively."
)

@override(AlgorithmConfig)
def get_rollout_fragment_length(self, worker_index: int = 0) -> int:
if self.rollout_fragment_length == "auto":
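The resulting user-facing API looks roughly as follows. This is a hedged sketch, not code from this PR; the learning-rate values are illustrative and Pendulum-v1 is just a placeholder environment.

from ray.rllib.algorithms.sac import SACConfig

# Sketch: configure SAC with the per-component learning rates added above.
config = (
    SACConfig()
    .environment("Pendulum-v1")
    .training(
        # Two-timescale setup: smaller learning rate for the actor (policy)
        # than for the critic, plus a separate rate for the entropy
        # temperature alpha.
        actor_lr=3e-5,
        critic_lr=3e-4,
        alpha_lr=1e-4,
        # The generic `lr` now defaults to None for SAC; setting it on the
        # new API stack makes `validate()` raise the ValueError shown above.
    )
)

# Building and training then works as for any other RLlib algorithm.
algo = config.build()
algo.train()

Note that mixing `lr` with the new parameters is intentionally rejected on the new API stack, so configs that still set `lr` must switch to the per-component parameters.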
8 changes: 4 additions & 4 deletions rllib/algorithms/sac/torch/sac_torch_learner.py
@@ -58,7 +58,7 @@ def configure_optimizers_for_module(
optimizer_name="qf",
optimizer=optim_critic,
params=params_critic,
lr_or_lr_schedule=config.lr,
lr_or_lr_schedule=config.critic_lr,
)
# If necessary register also an optimizer for a twin Q network.
if config.twin_q:
@@ -72,7 +72,7 @@
optimizer_name="qf_twin",
optimizer=optim_twin_critic,
params=params_twin_critic,
lr_or_lr_schedule=config.lr,
lr_or_lr_schedule=config.critic_lr,
)

# Define the optimizer for the actor.
@@ -86,7 +86,7 @@
optimizer_name="policy",
optimizer=optim_actor,
params=params_actor,
lr_or_lr_schedule=config.lr,
lr_or_lr_schedule=config.actor_lr,
)

# Define the optimizer for the temperature.
@@ -97,7 +97,7 @@
optimizer_name="alpha",
optimizer=optim_temperature,
params=[temperature],
lr_or_lr_schedule=config.lr,
lr_or_lr_schedule=config.alpha_lr,
)

@override(DQNRainbowTorchLearner)
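Because the learner wires each optimizer through `lr_or_lr_schedule`, the new parameters accept either a float or a piecewise schedule in the format documented in sac.py above. A hedged sketch with illustrative values:

from ray.rllib.algorithms.sac import SACConfig

config = (
    SACConfig()
    .training(
        # Constant actor learning rate (two-timescale: smaller than the critic's).
        actor_lr=3e-5,
        # Schedule format: [[timestep, lr-value], ...]; the first entry must
        # start at timestep 0 and intermediate timesteps are linearly
        # interpolated. Here the critic rate anneals from 3e-4 to 3e-5 over
        # the first one million timesteps.
        critic_lr=[[0, 3e-4], [1_000_000, 3e-5]],
        # Constant learning rate for the entropy temperature alpha.
        alpha_lr=3e-4,
    )
)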
4 changes: 4 additions & 0 deletions rllib/tuned_examples/sac/benchmark_sac_mujoco.py
@@ -95,6 +95,10 @@ def stop_all(self):
.training(
initial_alpha=1.001,
lr=3e-4,
# Choose a smaller learning rate for the actor (policy).
actor_lr=3e-5,
critic_lr=3e-4,
alpha_lr=1e-4,
target_entropy="auto",
n_step=1,
tau=0.005,
8 changes: 6 additions & 2 deletions rllib/tuned_examples/sac/benchmark_sac_mujoco_pb2.py
@@ -44,7 +44,9 @@
# Copy bottom % with top % weights.
quantile_fraction=0.25,
hyperparam_bounds={
"lr": [1e-5, 1e-3],
"actor_lr": [1e-5, 1e-3],
"critic_lr": [1e-6, 1e-4],
"alpha_lr": [1e-6, 1e-3],
"gamma": [0.95, 0.99],
"n_step": [1, 3],
"initial_alpha": [1.0, 1.5],
@@ -80,7 +82,9 @@
# TODO (simon): Adjust to new model_config_dict.
.training(
initial_alpha=tune.choice([1.0, 1.5]),
lr=tune.uniform(1e-5, 1e-3),
actor_lr=tune.uniform(1e-5, 1e-3),
critic_lr=tune.uniform(1e-6, 1e-4),
alpha_lr=tune.uniform(1e-6, 1e-3),
target_entropy=tune.choice([-10, -5, -1, "auto"]),
n_step=tune.choice([1, 3, (1, 3)]),
tau=tune.uniform(0.001, 0.1),
5 changes: 4 additions & 1 deletion rllib/tuned_examples/sac/multi_agent_pendulum_sac.py
@@ -27,7 +27,10 @@
.environment(env="multi_agent_pendulum")
.training(
initial_alpha=1.001,
lr=8e-4,
# Use a smaller learning rate for the policy.
actor_lr=3e-5,
critic_lr=3e-4,
alpha_lr=8e-4,
target_entropy="auto",
n_step=1,
tau=0.005,
5 changes: 4 additions & 1 deletion rllib/tuned_examples/sac/pendulum_sac.py
@@ -18,7 +18,10 @@
.environment(env="Pendulum-v1")
.training(
initial_alpha=1.001,
lr=3e-4,
# Use a smaller learning rate for the policy.
actor_lr=3e-5,
critic_lr=3e-4,
alpha_lr=1e-4,
target_entropy="auto",
n_step=1,
tau=0.005,
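For readers migrating existing SAC configs, here is a hedged sketch of how the old-stack `optimization_config` keys (still present in the default config shown in sac.py above) correspond to the new per-component parameters; the deprecation path for `optimization_config` itself is not part of this diff.

from ray.rllib.algorithms.sac import SACConfig

# Old API stack: learning rates nested in `optimization_config`.
old_style = SACConfig().training(
    optimization_config={
        "actor_learning_rate": 3e-5,
        "critic_learning_rate": 3e-4,
        "entropy_learning_rate": 3e-4,
    },
)

# New API stack: the top-level parameters added by this PR.
new_style = SACConfig().training(
    actor_lr=3e-5,   # corresponds to "actor_learning_rate"
    critic_lr=3e-4,  # corresponds to "critic_learning_rate"
    alpha_lr=3e-4,   # corresponds to "entropy_learning_rate"
)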