[RLlib] ReplayBuffer API Simple Q #22842

Merged
Changes from all commits
96 commits
3d6befc
first draft of classes
ArturNiederfahrenhorst Feb 3, 2022
28f23d3
formatting
ArturNiederfahrenhorst Feb 4, 2022
16d8d1e
added config
ArturNiederfahrenhorst Feb 4, 2022
4988b2b
typo
ArturNiederfahrenhorst Feb 4, 2022
9c45591
Reservoir buffer sketch and new typehints for sample()
ArturNiederfahrenhorst Feb 5, 2022
15b2e04
wip, https://github.com/ray-project/ray/pull/22114#discussion_r79971…
ArturNiederfahrenhorst Feb 5, 2022
2c6daba
wip https://github.com/ray-project/ray/pull/22114#discussion_r799724172
ArturNiederfahrenhorst Feb 5, 2022
2d10d74
added missing docstrings
ArturNiederfahrenhorst Feb 5, 2022
83a2dcb
Partial MixInReplayBuffer rewrite with added get_state and set_state …
ArturNiederfahrenhorst Feb 5, 2022
49e75da
sven's nits
ArturNiederfahrenhorst Feb 9, 2022
9d17c4d
wip
ArturNiederfahrenhorst Feb 9, 2022
96a4250
Merge branch 'master' into ReplayBufferAPI_tests
ArturNiederfahrenhorst Feb 9, 2022
bfbc354
jungs TODO from initial ReplayBuffer PR
ArturNiederfahrenhorst Feb 10, 2022
ccacadc
first bunch of tests
ArturNiederfahrenhorst Feb 11, 2022
6afc21c
features and fixes that came with first couple of tests
ArturNiederfahrenhorst Feb 11, 2022
4e4dbe5
replay buffer and tests done
ArturNiederfahrenhorst Feb 15, 2022
95e0ee3
prioritized replay buffer and tests done
ArturNiederfahrenhorst Feb 15, 2022
f47a0a1
merge from master
ArturNiederfahrenhorst Feb 15, 2022
53f9dd8
wip
ArturNiederfahrenhorst Feb 15, 2022
5bd50ad
Apply suggestions from code review
sven1977 Feb 16, 2022
0b64d62
MultiAgentReplayBuffer and tests
ArturNiederfahrenhorst Feb 18, 2022
ee37a85
Merge remote-tracking branch 'origin/ReplayBufferAPI_tests' into Repl…
ArturNiederfahrenhorst Feb 18, 2022
c6a73e1
MultiAgentReplayBuffer better tests and warning
ArturNiederfahrenhorst Feb 19, 2022
13032ac
Added MultiAgentPrioritizedReplayBuffer and tests
ArturNiederfahrenhorst Feb 19, 2022
3da08fc
minors
ArturNiederfahrenhorst Feb 21, 2022
bf4a665
multi agent prioritized comments, fixes
ArturNiederfahrenhorst Feb 21, 2022
a7b7c3e
multi agent comments, fixes
ArturNiederfahrenhorst Feb 21, 2022
90f3eca
MultiAgentMixInReplayBuffer and tests
ArturNiederfahrenhorst Feb 21, 2022
888dca7
Reservoir Buffer and tests
ArturNiederfahrenhorst Feb 21, 2022
0fd7a63
wip
ArturNiederfahrenhorst Feb 22, 2022
db98ef3
wup
ArturNiederfahrenhorst Feb 22, 2022
2ac5916
Merge remote-tracking branch 'upstream/master' into ReplayBufferAPI_t…
ArturNiederfahrenhorst Feb 24, 2022
23f7122
fix
ArturNiederfahrenhorst Feb 24, 2022
a870ad0
Adds tests for timeslices and changes SampleBatch.rows()
ArturNiederfahrenhorst Feb 26, 2022
3537e2b
finalizes check_sample_batches and test_multi_agent_batch
ArturNiederfahrenhorst Feb 26, 2022
98abf64
finalizes PR
ArturNiederfahrenhorst Feb 26, 2022
21c4b4b
test_rows
ArturNiederfahrenhorst Feb 27, 2022
7a8d0f3
format.sh
ArturNiederfahrenhorst Mar 4, 2022
85aaaad
Adds tests to CI, comments
ArturNiederfahrenhorst Mar 4, 2022
aeae356
Merge remote-tracking branch 'upstream/master' into fix-multiagentbat…
ArturNiederfahrenhorst Mar 4, 2022
1d18245
small fix in mixin buffer, format.sh
ArturNiederfahrenhorst Mar 5, 2022
e57ce01
Merge remote-tracking branch 'upstream/master' into ReplayBufferAPI_t…
ArturNiederfahrenhorst Mar 5, 2022
40dfcac
Merge branch 'fix-multiagentbatch-timeslices' into ReplayBufferAPI_tests
ArturNiederfahrenhorst Mar 5, 2022
e7b0ace
make mixin sequence sampling test less flaky
ArturNiederfahrenhorst Mar 5, 2022
56276c5
include tests in rllib BUILD file before moving to critical path
ArturNiederfahrenhorst Mar 6, 2022
1006364
Merge branch 'ReplayBuffersAPI_config' into ReplayBufferAPI_Simple_Q
ArturNiederfahrenhorst Mar 6, 2022
7118f8e
initial
ArturNiederfahrenhorst Mar 6, 2022
9d84ec0
format
ArturNiederfahrenhorst Mar 6, 2022
3b6dd74
format
ArturNiederfahrenhorst Mar 6, 2022
7f7b602
format
ArturNiederfahrenhorst Mar 6, 2022
cf9fd38
fix conflicting test names
ArturNiederfahrenhorst Mar 7, 2022
67265d4
compatibility with APPO and CQL, committing to see if CI is satisfied
ArturNiederfahrenhorst Mar 7, 2022
da50351
wip
ArturNiederfahrenhorst Mar 7, 2022
2c3a0d1
fix BUILD file
ArturNiederfahrenhorst Mar 7, 2022
7c837d9
Merge branch 'ReplayBufferAPI_tests' into ReplayBufferAPI_Simple_Q
ArturNiederfahrenhorst Mar 7, 2022
3cf1405
wip
ArturNiederfahrenhorst Mar 7, 2022
ed8e5d1
format
ArturNiederfahrenhorst Mar 8, 2022
cd92285
Merge branch 'ReplayBufferAPI_tests' into ReplayBufferAPI_Simple_Q
ArturNiederfahrenhorst Mar 8, 2022
10772f4
wip
ArturNiederfahrenhorst Mar 8, 2022
d93c217
format
ArturNiederfahrenhorst Mar 8, 2022
01d356d
Merge branch 'ReplayBufferAPI_tests' into ReplayBufferAPI_Simple_Q
ArturNiederfahrenhorst Mar 8, 2022
ae0d2ac
Merge remote-tracking branch 'upstream' into ReplayBufferAPI_tests
ArturNiederfahrenhorst Mar 8, 2022
aa3336d
Merge branch 'ReplayBufferAPI_tests' into ReplayBufferAPI_Simple_Q
ArturNiederfahrenhorst Mar 8, 2022
d69ca75
adds train iter fn to simple q, removes possible legacy parameters, r…
ArturNiederfahrenhorst Mar 9, 2022
b130323
format
ArturNiederfahrenhorst Mar 11, 2022
2d8cf0c
Merge remote-tracking branch 'upstream/master' into ReplayBufferAPI_S…
ArturNiederfahrenhorst Mar 11, 2022
b6f1620
wip
ArturNiederfahrenhorst Mar 11, 2022
7645541
wip
ArturNiederfahrenhorst Mar 11, 2022
c8d85e4
wip
ArturNiederfahrenhorst Mar 12, 2022
ef042a6
wip
ArturNiederfahrenhorst Mar 12, 2022
ed606fd
wip
ArturNiederfahrenhorst Mar 13, 2022
193f94f
Merge branch 'master' into ReplayBufferAPI_Simple_Q
ArturNiederfahrenhorst Mar 13, 2022
d3060f9
wip
ArturNiederfahrenhorst Mar 13, 2022
8a678a9
fix faulty apex config
ArturNiederfahrenhorst Mar 14, 2022
84e1a84
fix apex ddpg
ArturNiederfahrenhorst Mar 14, 2022
b1fd2db
undo ddpg changes
ArturNiederfahrenhorst Mar 14, 2022
57f1903
wip
ArturNiederfahrenhorst Mar 14, 2022
7a618b8
fix faulty SAC configuration in test
ArturNiederfahrenhorst Mar 16, 2022
4fc1cdf
fix replay burn in for r2d2
ArturNiederfahrenhorst Mar 16, 2022
5bc7fc2
fixes burn_in and providing old MultiAgentReplayBuffer as a class
ArturNiederfahrenhorst Mar 17, 2022
b67c7ce
wip
ArturNiederfahrenhorst Mar 17, 2022
ab9d548
Merge branch 'master' into ReplayBufferAPI_Simple_Q
ArturNiederfahrenhorst Mar 17, 2022
f9c520c
use SYNCH_WORKER_WEIGHTS_TIMER
ArturNiederfahrenhorst Mar 17, 2022
d94c7af
wip
ArturNiederfahrenhorst Mar 17, 2022
0732d8d
simple Q is learning
ArturNiederfahrenhorst Mar 18, 2022
1d7ed47
Comments and docstrings
ArturNiederfahrenhorst Mar 18, 2022
32e8558
Merge branch 'master' into ReplayBufferAPI_Simple_Q
ArturNiederfahrenhorst Mar 18, 2022
bf5cbf1
Sven's comments
ArturNiederfahrenhorst Mar 22, 2022
a8f3b84
Merge branch 'master' into ReplayBufferAPI_Simple_Q
ArturNiederfahrenhorst Mar 22, 2022
a2d5851
jun's feedback
ArturNiederfahrenhorst Mar 23, 2022
9032c5d
Sven's nits
ArturNiederfahrenhorst Mar 23, 2022
2a832ae
Merge branch 'master' into ReplayBufferAPI_Simple_Q
ArturNiederfahrenhorst Mar 27, 2022
0277416
svents nits
ArturNiederfahrenhorst Mar 28, 2022
bd51df8
Sven's comments
ArturNiederfahrenhorst Mar 28, 2022
e4fab6c
Merge branch 'master' of https://github.com/ray-project/ray into Repl…
sven1977 Mar 29, 2022
b8781a5
Merge remote-tracking branch 'artur/ReplayBufferAPI_Simple_Q' into Re…
sven1977 Mar 29, 2022
4 changes: 0 additions & 4 deletions release/rllib_tests/learning_tests/hard_learning_tests.yaml
@@ -80,8 +80,6 @@ apex-breakoutnoframeskip-v4:
epsilon_timesteps: 200000
final_epsilon: 0.01
prioritized_replay_alpha: 0.5
final_prioritized_replay_beta: 1.0
prioritized_replay_beta_annealing_timesteps: 2000000
num_gpus: 1
num_workers: 8
num_envs_per_worker: 8
@@ -327,8 +325,6 @@ dqn-breakoutnoframeskip-v4:
epsilon_timesteps: 200000
final_epsilon: 0.01
prioritized_replay_alpha: 0.5
final_prioritized_replay_beta: 1.0
prioritized_replay_beta_annealing_timesteps: 2000000
num_gpus: 0.5
timesteps_per_iteration: 10000

2 changes: 0 additions & 2 deletions release/rllib_tests/performance_tests/performance_tests.yaml
@@ -53,8 +53,6 @@ apex-breakoutnoframeskip-v4:
epsilon_timesteps: 200000
final_epsilon: 0.01
prioritized_replay_alpha: 0.5
final_prioritized_replay_beta: 1.0
prioritized_replay_beta_annealing_timesteps: 2000000
num_gpus: 1
num_workers: 8
num_envs_per_worker: 8
2 changes: 1 addition & 1 deletion rllib/agents/ddpg/apex.py
@@ -19,7 +19,7 @@
"num_workers": 32,
"buffer_size": 2000000,
# TODO(jungong) : update once Apex supports replay_buffer_config.
"replay_buffer_config": None,
"no_local_replay_buffer": True,
# Whether all shards of the replay buffer must be co-located
# with the learner process (running the execution plan).
# This is preferred b/c the learner process should have quick
4 changes: 0 additions & 4 deletions rllib/agents/ddpg/ddpg.py
@@ -111,10 +111,6 @@
"prioritized_replay_alpha": 0.6,
# Beta parameter for sampling from prioritized replay buffer.
"prioritized_replay_beta": 0.4,
# Time steps over which the beta parameter is annealed.
"prioritized_replay_beta_annealing_timesteps": 20000,
# Final value of beta
"final_prioritized_replay_beta": 0.4,
# Epsilon to add to the TD errors when updating priorities.
"prioritized_replay_eps": 1e-6,
# Whether to LZ4 compress observations
8 changes: 4 additions & 4 deletions rllib/agents/dqn/apex.py
@@ -66,7 +66,7 @@
"buffer_size": 2000000,
# TODO(jungong) : add proper replay_buffer_config after
# DistributedReplayBuffer type is supported.
"replay_buffer_config": None,
"no_local_replay_buffer": True,
# Whether all shards of the replay buffer must be co-located
# with the learner process (running the execution plan).
# This is preferred b/c the learner process should have quick
@@ -157,9 +157,9 @@ def execution_plan(
config["learning_starts"],
config["buffer_size"],
config["train_batch_size"],
config["prioritized_replay_alpha"],
config["prioritized_replay_beta"],
config["prioritized_replay_eps"],
config["replay_buffer_config"]["prioritized_replay_alpha"],
config["replay_buffer_config"]["prioritized_replay_beta"],
config["replay_buffer_config"]["prioritized_replay_eps"],
config["multiagent"]["replay_mode"],
config.get("replay_sequence_length", 1),
]
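The hunk above changes where Ape-X's execution plan looks up its prioritized-replay parameters: they now live under the nested replay_buffer_config dict instead of the top level of the trainer config. A minimal sketch of the two layouts, using plain Python dicts with example values borrowed from the DQN defaults elsewhere in this diff (not a full Ape-X config):

```python
# Legacy layout: the execution plan read flat, top-level keys.
legacy_config = {
    "prioritized_replay_alpha": 0.6,
    "prioritized_replay_beta": 0.4,
    "prioritized_replay_eps": 1e-6,
}
alpha_old = legacy_config["prioritized_replay_alpha"]

# Layout assumed by the updated hunk: the same settings are nested
# under "replay_buffer_config" and indexed through it.
new_config = {
    "replay_buffer_config": {
        "prioritized_replay_alpha": 0.6,
        "prioritized_replay_beta": 0.4,
        "prioritized_replay_eps": 1e-6,
    },
}
alpha_new = new_config["replay_buffer_config"]["prioritized_replay_alpha"]

assert alpha_old == alpha_new == 0.6
```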
49 changes: 37 additions & 12 deletions rllib/agents/dqn/dqn.py
@@ -35,6 +35,7 @@
from ray.rllib.utils.metrics.learner_info import LEARNER_STATS_KEY
from ray.rllib.utils.typing import TrainerConfigDict
from ray.util.iter import LocalIterator
from ray.rllib.utils.deprecation import DEPRECATED_VALUE

logger = logging.getLogger(__name__)

@@ -64,19 +65,37 @@
# N-step Q learning
"n_step": 1,

# === Prioritized replay buffer ===
# If True prioritized replay buffer will be used.
# === Replay buffer ===
# Size of the replay buffer. Note that if async_updates is set, then
# each worker will have a replay buffer of this size.
"buffer_size": DEPRECATED_VALUE,
# Prioritized replay is here since this algo uses the old replay
# buffer api
"prioritized_replay": True,
# Alpha parameter for prioritized replay buffer.
"prioritized_replay_alpha": 0.6,
# Beta parameter for sampling from prioritized replay buffer.
"prioritized_replay_beta": 0.4,
# Final value of beta (by default, we use constant beta=0.4).
"final_prioritized_replay_beta": 0.4,
# Time steps over which the beta parameter is annealed.
"prioritized_replay_beta_annealing_timesteps": 20000,
# Epsilon to add to the TD errors when updating priorities.
"prioritized_replay_eps": 1e-6,
"replay_buffer_config": {
# For now we don't use the new ReplayBuffer API here
"_enable_replay_buffer_api": False,
"type": "MultiAgentReplayBuffer",
"capacity": 50000,
"replay_batch_size": 32,
"prioritized_replay_alpha": 0.6,
# Beta parameter for sampling from prioritized replay buffer.
"prioritized_replay_beta": 0.4,
# Epsilon to add to the TD errors when updating priorities.
"prioritized_replay_eps": 1e-6,
},
# Set this to True, if you want the contents of your buffer(s) to be
# stored in any saved checkpoints as well.
# Warnings will be created if:
# - This is True AND restoring from a checkpoint that contains no buffer
# data.
# - This is False AND restoring from a checkpoint that does contain
# buffer data.
"store_buffer_in_checkpoints": False,
# The number of contiguous environment steps to replay at once. This may
# be set to greater than 1 to support recurrent models.
"replay_sequence_length": 1,
Review thread on replay_sequence_length:

Member: How does this work? I thought data in the replay buffer has already been post-processed, so these samples should all have the necessary state inputs for recurrent models.

Contributor Author: I think state inputs don't live in SampleBatches when they are stored in replay buffers. Recurrent state is passed through the forward() method of the ModelV2 API and is also initialized by the ModelV2 object via get_initial_state(). This should be taken into consideration in the connector design, right? @gjoliver

Member: OK, I need to double-check the code. It seems like the API for adding a SampleBatch assumes that the batch contains a full episode, slices it up according to replay_sequence_length, and stores multiple smaller batches as a result. Am I reading it right?

Contributor Author: Yes!

Member: OK, I read through everything; our codebase is really a mess. I believe SampleBatch does carry all the state_in/state_out columns; if you look at timeslice_along_seq_lens_with_overlap(), it handles the recurrent states correctly. All those complicated state-building logics in Sampler and SimpleListCollector are actually just for rollout, so I feel like we could clean up tons of CPU-heavy stuff that doesn't do anything today. By the way, if the ReplayBuffer is handling the batching of RNN states, how does RNN work for agents like PG that don't use a ReplayBuffer?

Member: Tested it out. It simply takes the raw batch with all the state_in and state_out columns, so it still runs fine. 👌

# Callback to run before learning on a multi-agent batch of
# experiences.
@@ -102,6 +121,12 @@
# === Parallelism ===
# Whether to compute priorities on workers.
"worker_side_prioritization": False,

# Experimental flag.
# If True, the execution plan API will not be used. Instead,
# a Trainer's `training_iteration` method will be called as-is each
# training iteration.
"_disable_execution_plan_api": False,
},
_allow_unknown_configs=True,
)
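The review thread above turns on how a full-episode batch becomes fixed-length sequences before storage. As an illustration only, the sketch below uses a hypothetical dict-of-lists stand-in for SampleBatch and ignores the overlap and zero-padding that RLlib's real timeslice_along_seq_lens_with_overlap() helper performs; it only shows that every column, including recurrent state columns, is cut with the same window:

```python
from typing import Dict, List


def timeslice_episode(
    episode: Dict[str, list], replay_sequence_length: int
) -> List[Dict[str, list]]:
    """Cut one full-episode batch into contiguous fixed-length slices.

    `episode` is a hypothetical stand-in for a SampleBatch: a dict that maps
    column names (obs, actions, state_in_0, ...) to equally long lists.
    """
    num_steps = len(episode["obs"])
    slices = []
    for start in range(0, num_steps, replay_sequence_length):
        end = min(start + replay_sequence_length, num_steps)
        # Every column, including state_in_*/state_out_* columns, is sliced
        # with the same window, so recurrent state stays aligned with the
        # observations and actions of its slice.
        slices.append({col: vals[start:end] for col, vals in episode.items()})
    return slices


# Toy 5-step episode with one recurrent-state column.
episode = {
    "obs": [0, 1, 2, 3, 4],
    "actions": [1, 0, 1, 1, 0],
    "state_in_0": [[0.0], [0.1], [0.2], [0.3], [0.4]],
}
for chunk in timeslice_episode(episode, replay_sequence_length=2):
    print(chunk["obs"], chunk["state_in_0"])
```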
12 changes: 8 additions & 4 deletions rllib/agents/dqn/dqn_tf_policy.py
@@ -451,10 +451,14 @@ def postprocess_nstep_and_prio(
batch[SampleBatch.DONES],
batch[PRIO_WEIGHTS],
)
new_priorities = (
np.abs(convert_to_numpy(td_errors))
+ policy.config["prioritized_replay_eps"]
)
# Retain compatibility with old-style Replay args
epsilon = policy.config.get("replay_buffer_config", {}).get(
"prioritized_replay_eps"
) or policy.config.get("prioritized_replay_eps")
if epsilon is None:
raise ValueError("prioritized_replay_eps not defined in config.")

new_priorities = np.abs(convert_to_numpy(td_errors)) + epsilon
batch[PRIO_WEIGHTS] = new_priorities

return batch
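The compatibility lookup in the hunk above can be exercised with either config layout. A small standalone sketch that mirrors just the lookup logic (not the full policy code):

```python
def lookup_prioritized_replay_eps(config: dict) -> float:
    """Resolve prioritized_replay_eps from either the new nested
    replay_buffer_config or the legacy top-level key."""
    epsilon = config.get("replay_buffer_config", {}).get(
        "prioritized_replay_eps"
    ) or config.get("prioritized_replay_eps")
    if epsilon is None:
        raise ValueError("prioritized_replay_eps not defined in config.")
    return epsilon


# New-style config: the value lives inside replay_buffer_config.
assert lookup_prioritized_replay_eps(
    {"replay_buffer_config": {"prioritized_replay_eps": 1e-6}}
) == 1e-6

# Legacy config: the value still resolves from the top level.
assert lookup_prioritized_replay_eps({"prioritized_replay_eps": 1e-6}) == 1e-6
```

Note that the `or` chain falls back to the legacy key whenever the nested value is missing (or falsy), which is what keeps old-style configs working.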
19 changes: 19 additions & 0 deletions rllib/agents/dqn/r2d2.py
@@ -29,6 +29,19 @@
# Batch mode must be complete_episodes.
"batch_mode": "complete_episodes",

# === Replay buffer ===
"replay_buffer_config": {
# For now we don't use the new ReplayBuffer API here
"_enable_replay_buffer_api": False,
"type": "MultiAgentReplayBuffer",
"capacity": 50000,
"replay_batch_size": 32,
"prioritized_replay_alpha": 0.6,
# Beta parameter for sampling from prioritized replay buffer.
"prioritized_replay_beta": 0.4,
# Epsilon to add to the TD errors when updating priorities.
"prioritized_replay_eps": 1e-6,
},
# If True, assume a zero-initialized state input (no matter where in
# the episode the sequence is located).
# If False, store the initial states along with each SampleBatch, use
@@ -66,6 +79,12 @@

# Update the target network every `target_network_update_freq` steps.
"target_network_update_freq": 2500,

# Experimental flag.
# If True, the execution plan API will not be used. Instead,
# a Trainer's `training_iteration` method will be called as-is each
# training iteration.
"_disable_execution_plan_api": False,
},
_allow_unknown_configs=True,
)
109 changes: 102 additions & 7 deletions rllib/agents/dqn/simple_q.py
@@ -15,19 +15,40 @@
from ray.rllib.agents.dqn.simple_q_tf_policy import SimpleQTFPolicy
from ray.rllib.agents.dqn.simple_q_torch_policy import SimpleQTorchPolicy
from ray.rllib.agents.trainer import Trainer, with_common_config
from ray.rllib.utils.metrics import SYNCH_WORKER_WEIGHTS_TIMER
from ray.rllib.execution.concurrency_ops import Concurrently
from ray.rllib.execution.metric_ops import StandardMetricsReporting
from ray.rllib.execution.replay_ops import Replay, StoreToReplayBuffer
from ray.rllib.execution.rollout_ops import ParallelRollouts
from ray.rllib.execution.rollout_ops import (
ParallelRollouts,
synchronous_parallel_sample,
)
from ray.rllib.execution.train_ops import (
MultiGPUTrainOneStep,
TrainOneStep,
MultiGPUTrainOneStep,
train_one_step,
multi_gpu_train_one_step,
)
from ray.rllib.execution.train_ops import (
UpdateTargetNetwork,
)
from ray.rllib.policy.policy import Policy
from ray.rllib.policy.sample_batch import SampleBatch
from ray.rllib.utils.annotations import ExperimentalAPI
from ray.rllib.utils.annotations import override
from ray.rllib.utils.deprecation import DEPRECATED_VALUE
from ray.rllib.utils.typing import TrainerConfigDict
from ray.rllib.utils.metrics import (
NUM_ENV_STEPS_SAMPLED,
NUM_AGENT_STEPS_SAMPLED,
)
from ray.rllib.utils.typing import (
ResultDict,
TrainerConfigDict,
)
from ray.rllib.utils.metrics import (
LAST_TARGET_UPDATE_TS,
NUM_TARGET_UPDATES,
)

logger = logging.getLogger(__name__)

@@ -64,9 +85,18 @@
# Size of the replay buffer. Note that if async_updates is set, then
# each worker will have a replay buffer of this size.
"buffer_size": DEPRECATED_VALUE,
# Deprecated for Simple Q because of new ReplayBuffer API
# Use MultiAgentPrioritizedReplayBuffer for prioritization.
"prioritized_replay": DEPRECATED_VALUE,
"replay_buffer_config": {
# Use the new ReplayBuffer API here
"_enable_replay_buffer_api": True,
"type": "MultiAgentReplayBuffer",
"capacity": 50000,
"replay_batch_size": 32,
# The number of contiguous environment steps to replay at once. This
# may be set to greater than 1 to support recurrent models.
"replay_sequence_length": 1,
},
# Set this to True, if you want the contents of your buffer(s) to be
# stored in any saved checkpoints as well.
@@ -76,9 +106,6 @@
# - This is False AND restoring from a checkpoint that does contain
# buffer data.
"store_buffer_in_checkpoints": False,
# The number of contiguous environment steps to replay at once. This may
# be set to greater than 1 to support recurrent models.
"replay_sequence_length": 1,

# === Optimization ===
# Learning rate for adam optimizer
@@ -108,6 +135,12 @@
"num_workers": 0,
# Prevent reporting frequency from going lower than this time span.
"min_time_s_per_reporting": 1,

# Experimental flag.
# If True, the execution plan API will not be used. Instead,
# a Trainer's `training_iteration` method will be called as-is each
# training iteration.
"_disable_execution_plan_api": True,
})
# __sphinx_doc_end__
# fmt: on
@@ -139,7 +172,9 @@ def validate_config(self, config: TrainerConfigDict) -> None:
" used at the same time!"
)

if config.get("prioritized_replay"):
if config.get("prioritized_replay") or config.get(
"replay_buffer_config", {}
).get("prioritized_replay"):
if config["multiagent"]["replay_mode"] == "lockstep":
raise ValueError(
"Prioritized replay is not supported when replay_mode=lockstep."
@@ -215,3 +250,63 @@ def execution_plan(workers, config, **kwargs):
)

return StandardMetricsReporting(train_op, workers, config)

@ExperimentalAPI
def training_iteration(self) -> ResultDict:
"""Simple Q training iteration function.

Simple Q consists of the following steps:
- (1) Sample (MultiAgentBatch) from workers...
- (2) Store new samples in replay buffer.
- (3) Sample training batch (MultiAgentBatch) from replay buffer.
- (4) Learn on training batch.
- (5) Update target network every target_network_update_freq steps.
- (6) Return all collected metrics for the iteration.

Returns:
The results dict from executing the training iteration.
"""
batch_size = self.config["train_batch_size"]
local_worker = self.workers.local_worker()

# (1) Sample (MultiAgentBatch) from workers
new_sample_batches = synchronous_parallel_sample(self.workers)

for s in new_sample_batches:
# Update counters
self._counters[NUM_ENV_STEPS_SAMPLED] += len(s)
self._counters[NUM_AGENT_STEPS_SAMPLED] += (
len(s) if isinstance(s, SampleBatch) else s.agent_steps()
)
# (2) Store new samples in replay buffer
self.local_replay_buffer.add(s)

# (3) Sample training batch (MultiAgentBatch) from replay buffer.
train_batch = self.local_replay_buffer.sample(batch_size)

# (4) Learn on training batch.
# Use simple optimizer (only for multi-agent or tf-eager; all other
# cases should use the multi-GPU optimizer, even if only using 1 GPU)
if self.config.get("simple_optimizer") is True:
train_results = train_one_step(self, train_batch)
else:
train_results = multi_gpu_train_one_step(self, train_batch)

# (5) Update target network every target_network_update_freq steps
cur_ts = self._counters[NUM_ENV_STEPS_SAMPLED]
last_update = self._counters[LAST_TARGET_UPDATE_TS]
if cur_ts - last_update >= self.config["target_network_update_freq"]:
to_update = local_worker.get_policies_to_train()
local_worker.foreach_policy_to_train(
lambda p, pid: pid in to_update and p.update_target()
)
self._counters[NUM_TARGET_UPDATES] += 1
self._counters[LAST_TARGET_UPDATE_TS] = cur_ts

# Update remote workers' weights after learning on local worker
if self.workers.remote_workers():
with self._timers[SYNCH_WORKER_WEIGHTS_TIMER]:
self.workers.sync_weights()

# (6) Return all collected metrics for the iteration.
return train_results
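Taken together, the SimpleQ changes switch the trainer to the new ReplayBuffer API and to the training_iteration() path. A hypothetical usage sketch of that path follows; the import path, the SimpleQTrainer and DEFAULT_CONFIG names, and the CartPole-v0 environment are assumptions based on the surrounding RLlib code and are not verified against this branch:

```python
import ray
from ray.rllib.agents.dqn.simple_q import DEFAULT_CONFIG, SimpleQTrainer

ray.init()

config = DEFAULT_CONFIG.copy()
# New ReplayBuffer API defaults from this diff, with a smaller capacity.
config["replay_buffer_config"] = {
    "_enable_replay_buffer_api": True,
    "type": "MultiAgentReplayBuffer",
    "capacity": 10000,
    "replay_batch_size": 32,
    "replay_sequence_length": 1,
}
# Run the new training_iteration() path instead of the execution plan.
config["_disable_execution_plan_api"] = True

trainer = SimpleQTrainer(config=config, env="CartPole-v0")
for _ in range(3):
    # Each train() call drives steps (1)-(6) from the docstring above:
    # sample, store, replay, learn, maybe update the target net, report.
    results = trainer.train()
    print(results.get("episode_reward_mean"))
```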
2 changes: 0 additions & 2 deletions rllib/agents/sac/sac.py
@@ -109,8 +109,6 @@
"prioritized_replay_alpha": 0.6,
"prioritized_replay_beta": 0.4,
"prioritized_replay_eps": 1e-6,
"prioritized_replay_beta_annealing_timesteps": 20000,
"final_prioritized_replay_beta": 0.4,
# Whether to LZ4 compress observations
"compress_observations": False,

2 changes: 1 addition & 1 deletion rllib/agents/sac/tests/test_sac.py
@@ -85,7 +85,7 @@ def test_sac_compilation(self):
# If we use default buffer size (1e6), the buffer will take up
# 169.445 GB memory, which is beyond travis-ci's current (Mar 19, 2021)
# available system memory (8.34816 GB).
config["buffer_size"] = 40000
config["replay_buffer_config"]["capacity"] = 40000
# Test with saved replay buffer.
config["store_buffer_in_checkpoints"] = True
num_iterations = 1