[RLlib; Offline RL] Implement CQL algorithm logic in new API stack. #47000
@@ -1,7 +1,14 @@
import logging
from typing import Optional, Type
from typing import Optional, Type, Union

from ray.rllib.algorithms.algorithm_config import AlgorithmConfig, NotProvided
from ray.rllib.connectors.common.add_observations_from_episodes_to_batch import (
    AddObservationsFromEpisodesToBatch,
)
from ray.rllib.connectors.learner.add_next_observations_from_episodes_to_train_batch import (  # noqa
    AddNextObservationsFromEpisodesToTrainBatch,
)
from ray.rllib.core.learner.learner import Learner
from ray.rllib.algorithms.cql.cql_tf_policy import CQLTFPolicy
from ray.rllib.algorithms.cql.cql_torch_policy import CQLTorchPolicy
from ray.rllib.algorithms.sac.sac import (
@@ -23,15 +30,23 @@
)
from ray.rllib.utils.framework import try_import_tf, try_import_tfp
from ray.rllib.utils.metrics import (
    ALL_MODULES,
    LEARNER_RESULTS,
    LEARNER_UPDATE_TIMER,
    LAST_TARGET_UPDATE_TS,
    NUM_AGENT_STEPS_SAMPLED,
    NUM_AGENT_STEPS_TRAINED,
    NUM_ENV_STEPS_SAMPLED,
    NUM_ENV_STEPS_TRAINED,
    NUM_ENV_STEPS_TRAINED_LIFETIME,
    NUM_MODULE_STEPS_TRAINED,
    NUM_MODULE_STEPS_TRAINED_LIFETIME,
    NUM_TARGET_UPDATES,
    OFFLINE_SAMPLING_TIMER,
    TARGET_NET_UPDATE_TIMER,
    SYNCH_WORKER_WEIGHTS_TIMER,
    SAMPLE_TIMER,
    TIMERS,
)
from ray.rllib.utils.typing import ResultDict
@@ -122,6 +137,38 @@ def training(

        return self

    @override(SACConfig)
    def get_default_learner_class(self) -> Union[Type["Learner"], str]:
        if self.framework_str == "torch":
            from ray.rllib.algorithms.cql.torch.cql_torch_learner import CQLTorchLearner

            return CQLTorchLearner
        else:
            raise ValueError(
                f"The framework {self.framework_str} is not supported. "
                "Use `'torch'` instead."
            )

    @override(AlgorithmConfig)
    def build_learner_connector(
        self,
        input_observation_space,
        input_action_space,
        device=None,
    ):
        pipeline = super().build_learner_connector(
            input_observation_space=input_observation_space,
            input_action_space=input_action_space,
            device=device,
        )

        pipeline.insert_after(
            AddObservationsFromEpisodesToBatch,
            AddNextObservationsFromEpisodesToTrainBatch(),
        )

        return pipeline

Review comment (on `build_learner_connector`): See my comment in CQLLearner. I think we should move this there. Same as for DQN. Or, if there are good arguments to leave these in the config classes, we should also change it in DQN/SAC.

Reply: As mentioned above. In rare cases the class inheritance avoids it being changed there and changing it in … You are right that we need to unify the way of adding them in all algorithms (also MARWIL).

Reply: It just doesn't feel clean, doing it here. The config classes should contain as little (actually implemented) algo logic as possible and just store settings. I know I sound like a broken record, but "separation of concerns" principle :)
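For reference, a rough sketch of the alternative the reviewers discuss (not part of this diff): letting the CQLLearner add the connector piece itself inside its `build()` method. It assumes the base Learner exposes the built pipeline as `self._learner_connector` after `super().build()`; treat it as an illustration, not the PR's implementation.

from ray.rllib.algorithms.sac.sac_learner import SACLearner
from ray.rllib.connectors.common.add_observations_from_episodes_to_batch import (
    AddObservationsFromEpisodesToBatch,
)
from ray.rllib.connectors.learner.add_next_observations_from_episodes_to_train_batch import (  # noqa
    AddNextObservationsFromEpisodesToTrainBatch,
)
from ray.rllib.core.learner.learner import Learner
from ray.rllib.utils.annotations import override


class CQLLearner(SACLearner):
    @override(Learner)
    def build(self) -> None:
        # Set up the SAC learner first (alpha, target entropy, etc.).
        super().build()

        # Add the NEXT_OBS piece right after the default observation piece, so
        # that the hard requirement lives with the component that depends on it.
        # Assumption: the Learner stores its pipeline as `self._learner_connector`.
        self._learner_connector.insert_after(
            AddObservationsFromEpisodesToBatch,
            AddNextObservationsFromEpisodesToTrainBatch(),
        )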
    @override(SACConfig)
    def validate(self) -> None:
        # First check, whether old `timesteps_per_iteration` is used.
@@ -150,6 +197,12 @@ def validate(self) -> None:

            )
            try_import_tfp(error=True)

        # Ensure that for a local learner the number of iterations is 1. Note,
        # this is needed because we have no iterators, but instead a single
        # batch returned directly from the `OfflineData.sample` method.
        if self.num_learners == 0 and not self.dataset_num_iters_per_learner:
            self.dataset_num_iters_per_learner = 1

class CQL(SAC):
    """CQL (derived from SAC)."""
@@ -171,6 +224,77 @@ def get_default_policy_class(

    @override(SAC)
    def training_step(self) -> ResultDict:
        if self.config.enable_env_runner_and_connector_v2:
            return self._training_step_new_api_stack()
        elif self.config.enable_rl_module_and_learner:
            raise ValueError(
                "Hybrid API stack is not supported. Either set "
                "`enable_rl_module_and_learner=True` and "
                "`enable_env_runner_and_connector_v2=True` or set both "
                "attributes to `False`."
            )
        else:
            return self._training_step_old_api_stack()

    def _training_step_new_api_stack(self) -> ResultDict:

        with self.metrics.log_time((TIMERS, OFFLINE_SAMPLING_TIMER)):
            # Sampling from offline data.
            batch = self.offline_data.sample(
                num_samples=self.config.train_batch_size_per_learner,
                num_shards=self.config.num_learners,
                return_iterator=True if self.config.num_learners > 1 else False,
            )

Review comment (on the `return_iterator` argument): super nit: …

Reply: Great catch :)

        with self.metrics.log_time((TIMERS, LEARNER_UPDATE_TIMER)):
            # Updating the policy.
            # TODO (simon, sven): Check, if we should execute directly s.th. like
            # update_from_iterator.
            learner_results = self.learner_group.update_from_batch(
                batch,
                minibatch_size=self.config.train_batch_size_per_learner,
                num_iters=self.config.dataset_num_iters_per_learner,
            )

            # Log training results.
            self.metrics.merge_and_log_n_dicts(learner_results, key=LEARNER_RESULTS)
            self.metrics.log_value(
                NUM_ENV_STEPS_TRAINED_LIFETIME,
                self.metrics.peek(
                    (LEARNER_RESULTS, ALL_MODULES, NUM_ENV_STEPS_TRAINED)
                ),
                reduce="sum",
            )
            self.metrics.log_dict(
                {
                    (LEARNER_RESULTS, mid, NUM_MODULE_STEPS_TRAINED_LIFETIME): (
                        stats[NUM_MODULE_STEPS_TRAINED]
                    )
                    for mid, stats in self.metrics.peek(LEARNER_RESULTS).items()
                },
                reduce="sum",
            )

        # Synchronize weights.
        # As the results contain the loss for each policy and, in addition, the
        # total loss over all policies, this total loss has to be removed.
        modules_to_update = set(learner_results[0].keys()) - {ALL_MODULES}

        # Update weights - after learning on the local worker -
        # on all remote workers. Note, we only have the local `EnvRunner`,
        # but from this `EnvRunner` the evaluation `EnvRunner`s get updated.
        with self.metrics.log_time((TIMERS, SYNCH_WORKER_WEIGHTS_TIMER)):
            self.env_runner_group.sync_weights(
                # Sync weights from learner_group to all EnvRunners.
                from_worker_or_learner_group=self.learner_group,
                policies=modules_to_update,
                inference_only=True,
            )

        return self.metrics.reduce()

    def _training_step_old_api_stack(self) -> ResultDict:
        # Collect SampleBatches from sample workers.
        with self._timers[SAMPLE_TIMER]:
            train_batch = synchronous_parallel_sample(worker_set=self.env_runner_group)
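For orientation, a minimal usage sketch of how this new-stack CQL training path would be driven (not taken from this PR). The environment, data path, and numeric values are placeholders, and the exact builder-method keywords may differ from the final API.

from ray.rllib.algorithms.cql import CQLConfig

config = (
    CQLConfig()
    # New API stack: RLModule/Learner plus EnvRunner/ConnectorV2.
    .api_stack(
        enable_rl_module_and_learner=True,
        enable_env_runner_and_connector_v2=True,
    )
    .environment("Pendulum-v1")
    # Placeholder path; point this at recorded offline episodes.
    .offline_data(input_="/path/to/offline/data")
    .training(train_batch_size_per_learner=1024)
    # num_learners=0 -> a single local Learner; `validate()` above then
    # defaults `dataset_num_iters_per_learner` to 1.
    .learners(num_learners=0)
)

algo = config.build()
for _ in range(10):
    results = algo.train()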
New file (20 lines added), defining the CQLLearner class:
@@ -0,0 +1,20 @@
from ray.air.constants import TRAINING_ITERATION
from ray.rllib.algorithms.sac.sac_learner import SACLearner
from ray.rllib.core.learner.learner import Learner
from ray.rllib.utils.annotations import override
from ray.rllib.utils.metrics import ALL_MODULES


class CQLLearner(SACLearner):
    @override(Learner)
    def build(self) -> None:
        # We need to call the `super()`'s `build` method here to have the variables
        # for `alpha` and the target entropy defined.
        super().build()

        # Add a metric to keep track of training iterations to
        # determine when switching the actor loss from behavior
        # cloning to SAC.
        self.metrics.log_value(
            (ALL_MODULES, TRAINING_ITERATION), float("nan"), window=1
        )

Review comment (on `super().build()`): I feel like it's better to add the NEXT_OBS LearnerConnector piece here, what do you think? The CQLLearner is the component that has this (hard) requirement (meaning it won't work w/o this connector piece), so it should take care of adding it here.

Reply: I agree that the Learner is the component that needs the connectors. On the other side it uses the … In this specific case I needed to add it in the …
Review comment (on the `log_value` call): This can all go, b/c we now use the …
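For context on why the TRAINING_ITERATION metric is logged: CQL warms up its actor with behavior cloning for a configurable number of iterations (`bc_iters`) before switching to the SAC-style actor loss. Below is only an illustrative, self-contained sketch of that switch; the function and its tensor arguments are hypothetical and not the loss code of this PR.

import torch


def cql_actor_loss_sketch(
    logps_resampled: torch.Tensor,  # log-probs of actions re-sampled from the policy
    q_resampled: torch.Tensor,  # Q-values for those re-sampled actions
    logps_batch_actions: torch.Tensor,  # log-probs of the offline dataset actions
    alpha: torch.Tensor,  # entropy temperature
    training_iteration: int,  # value tracked via (ALL_MODULES, TRAINING_ITERATION)
    bc_iters: int,  # number of initial behavior-cloning iterations
) -> torch.Tensor:
    if training_iteration < bc_iters:
        # Warm-up phase: behavior-clone the dataset actions.
        return torch.mean(alpha.detach() * logps_resampled - logps_batch_actions)
    # Afterwards: standard SAC-style actor loss.
    return torch.mean(alpha.detach() * logps_resampled - q_resampled)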
Review comment: size="large" is better, no?

Reply: Yeah, maybe. It takes around 50 iterations.

Reply: Will tune this in another PR.