[RLlib] PolicyMap LRU cache enhancements: Swap out policies (instead of GC'ing and recreating) + use Ray object store (instead of file system). #29513

Merged
Commits (59, showing changes from all commits)
b59d74e
wip
sven1977 Sep 28, 2022
76b3c1c
Merge branch 'master' into policy_map_lru_cache_enhancements
sven1977 Oct 12, 2022
d3c2214
wip
sven1977 Oct 14, 2022
e9f2901
wip
sven1977 Oct 19, 2022
3f256d4
wip
sven1977 Oct 19, 2022
ea12a2e
wip
sven1977 Oct 20, 2022
a9c441a
Merge branch 'master' of https://github.com/ray-project/ray into poli…
sven1977 Oct 27, 2022
9c1be5b
wip
sven1977 Oct 27, 2022
e5def7d
LINT
sven1977 Oct 27, 2022
be5c5bb
Merge branch 'master' of https://github.com/ray-project/ray into poli…
sven1977 Oct 31, 2022
3c77434
wip
sven1977 Oct 31, 2022
763b150
wip
sven1977 Nov 1, 2022
ec6dbed
wip
sven1977 Nov 1, 2022
a42b7df
Merge branch 'master' of https://github.com/ray-project/ray into poli…
sven1977 Nov 1, 2022
492ca66
wip
sven1977 Nov 1, 2022
b9bb2a4
wip
sven1977 Nov 2, 2022
0d6100a
wip
sven1977 Nov 3, 2022
4e10e5b
wip
sven1977 Nov 3, 2022
f10fa48
Merge branch 'master' of https://github.com/ray-project/ray into only…
sven1977 Nov 3, 2022
fec7127
wip
sven1977 Nov 3, 2022
6abd07a
LINT
sven1977 Nov 3, 2022
b0995e2
Merge branch 'master' of https://github.com/ray-project/ray into poli…
sven1977 Nov 3, 2022
1e70273
wip
sven1977 Nov 3, 2022
c605dcb
wip
sven1977 Nov 3, 2022
7918fd3
wip
sven1977 Nov 3, 2022
d98fd28
wip
sven1977 Nov 3, 2022
a8cc1ea
wip
sven1977 Nov 3, 2022
a0575a8
LINT.
sven1977 Nov 3, 2022
717d1c4
wip
sven1977 Nov 3, 2022
963d8a5
Merge branch 'run_regression_test_should_handle_py_files' into policy…
sven1977 Nov 3, 2022
0863f0c
Merge branch 'only_sync_updated_policy_weights' into policy_map_lru_c…
sven1977 Nov 3, 2022
988c943
wip
sven1977 Nov 3, 2022
b29bb06
Merge branch 'master' of https://github.com/ray-project/ray into poli…
sven1977 Nov 17, 2022
035bd59
wip
sven1977 Nov 17, 2022
85fe5b5
wip
sven1977 Nov 17, 2022
7f872b3
wip
sven1977 Nov 17, 2022
b411c7e
Merge branch 'master' of https://github.com/ray-project/ray into poli…
sven1977 Nov 22, 2022
98bbfac
LINT
sven1977 Nov 22, 2022
28bb8e3
Merge branch 'master' of https://github.com/ray-project/ray into poli…
sven1977 Nov 25, 2022
bb8acaa
wip
sven1977 Nov 25, 2022
1a2c28f
Merge branch 'master' of https://github.com/ray-project/ray into poli…
sven1977 Nov 29, 2022
7f657ec
fixes
sven1977 Nov 29, 2022
3165b3b
fixes
sven1977 Nov 29, 2022
ac0e706
fixes
sven1977 Nov 29, 2022
439ec75
wip
sven1977 Nov 29, 2022
7c8838a
wip
sven1977 Nov 29, 2022
9ce7997
fix
sven1977 Nov 29, 2022
352da04
wip
sven1977 Nov 29, 2022
3a37b97
wip
sven1977 Nov 30, 2022
0979b53
wip
sven1977 Nov 30, 2022
8dbc53e
Merge branch 'master' of https://github.com/ray-project/ray into poli…
sven1977 Nov 30, 2022
d16d855
wip
sven1977 Nov 30, 2022
7125de0
Merge branch 'master' of https://github.com/ray-project/ray into poli…
sven1977 Nov 30, 2022
9cfd910
wip
sven1977 Nov 30, 2022
7eed7c0
wip
sven1977 Nov 30, 2022
3efb710
wip
sven1977 Nov 30, 2022
e52d67b
wip
sven1977 Nov 30, 2022
e39237d
wip
sven1977 Nov 30, 2022
e5d1362
wip
sven1977 Nov 30, 2022
4 changes: 2 additions & 2 deletions .buildkite/pipeline.gpu.yml
@@ -6,9 +6,9 @@
# - TUNE_TESTING=1 ./ci/env/install-dependencies.sh
# - pip install -Ur ./python/requirements/ml/requirements_ml_docker.txt
# - ./ci/env/env_info.sh
# - bazel test --config=ci $(./ci/run/bazel_export_options) --build_tests_only --test_tag_filters=gpu,gpu_only python/ray/tune/...
# - bazel test --config=ci $(./ci/run/bazel_export_options) --build_tests_only --test_tag_filters=gpu python/ray/tune/...
sven1977 (Contributor Author) commented:
Cleaned this up for a new test that swaps out policy weights on GPU.


- label: ":tv: :brain: RLlib: GPU Examples {A/B}"
- label: ":tv: :brain: RLlib: GPU Tests"
conditions: ["NO_WHEELS_REQUIRED", "RAY_CI_RLLIB_AFFECTED"]
commands:
- cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/build/upload_build_info.sh; fi }; trap cleanup EXIT
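The comment above mentions a new GPU test that swaps out policy weights. As a bare-bones illustration of what "swapping out policy weights" means at the tensor level (toy torch modules only, not the actual RLlib test), one could write:

import torch

# Two structurally identical networks; net_b takes over net_a's parameters
# via a plain state-dict copy ("the swap"). Runs on GPU if one is available.
device = "cuda" if torch.cuda.is_available() else "cpu"

net_a = torch.nn.Linear(8, 2).to(device)
net_b = torch.nn.Linear(8, 2).to(device)

net_b.load_state_dict(net_a.state_dict())

x = torch.randn(1, 8, device=device)
assert torch.allclose(net_a(x), net_b(x))  # identical outputs after the swap

The actual test presumably does the same with full Policy objects (including optimizer state); the point is that no object is destroyed or rebuilt, only its parameters are overwritten.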
28 changes: 26 additions & 2 deletions rllib/BUILD
@@ -47,9 +47,9 @@
# the RLlib Team.
# - "needs_gpu": Indicating that a test needs to have a GPU in order to run.
# - "gpu": Indicating that a test may (but doesn't have to) be run in the GPU
# pipeline, defined in .buildkite/pipeline.gpu.yaml.
# pipeline, defined in .buildkite/pipeline.gpu.yml.
# - "multi-gpu": Indicating that a test will definitely be run in the Large GPU
# pipeline, defined in .buildkite/pipeline.gpu.large.yaml.
# pipeline, defined in .buildkite/pipeline.gpu.large.yml.
# - "no_gpu": Indicating that a test should not be run in the GPU pipeline due
# to certain incompatibilities.
# - "no_tf_eager_tracing": Exclude this test from tf-eager tracing tests.
@@ -201,6 +201,16 @@ py_test(
args = ["--dir=tuned_examples/appo"]
)

py_test(
name = "learning_tests_multi_agent_cartpole_w_100_policies_appo",
main = "tests/run_regression_tests.py",
tags = ["team:rllib", "exclusive", "learning_tests", "learning_tests_cartpole", "learning_tests_discrete"],
size = "large",
srcs = ["tests/run_regression_tests.py"],
data = ["tuned_examples/appo/multi-agent-cartpole-w-100-policies-appo.py"],
args = ["--dir=tuned_examples/appo"]
)

# py_test(
# name = "learning_tests_frozenlake_appo",
# main = "tests/run_regression_tests.py",
@@ -1975,6 +1985,20 @@ py_test(
srcs = ["policy/tests/test_policy.py"]
)

py_test(
name = "policy/tests/test_policy_map",
tags = ["team:rllib", "policy"],
size = "small",
srcs = ["policy/tests/test_policy_map.py"]
)

py_test(
name = "policy/tests/test_policy_state_swapping",
tags = ["team:rllib", "policy", "gpu"],
size = "medium",
srcs = ["policy/tests/test_policy_state_swapping.py"]
)

py_test(
name = "policy/tests/test_rnn_sequencing",
tags = ["team:rllib", "policy"],
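The two new targets, test_policy_map and test_policy_state_swapping, cover the reworked cache. As a rough mental model of the "swap instead of rebuild" idea they exercise (a toy sketch under stated assumptions, not RLlib's actual PolicyMap; build_policy and initial_states are hypothetical hooks):

from collections import OrderedDict

class SwappingLRUCache:
    """Keep at most `capacity` live policy objects; on a cache miss, reuse an
    evicted object and overwrite its state instead of GC'ing and rebuilding."""

    def __init__(self, capacity, build_policy, initial_states):
        self.capacity = capacity
        self.build_policy = build_policy    # () -> fresh policy object
        self.states = dict(initial_states)  # policy_id -> serialized state
        self.live = OrderedDict()           # policy_id -> live policy object

    def __getitem__(self, policy_id):
        if policy_id in self.live:
            self.live.move_to_end(policy_id)  # mark as most recently used
            return self.live[policy_id]
        if len(self.live) < self.capacity:
            policy = self.build_policy()      # built at most `capacity` times
        else:
            # Evict the least recently used entry but keep its object around.
            evicted_id, policy = self.live.popitem(last=False)
            self.states[evicted_id] = policy.get_state()
        # Swapping states only works if all policies share the same network
        # and optimizer layout.
        policy.set_state(self.states[policy_id])
        self.live[policy_id] = policy
        return policy

Because evicted objects are reused, at most `capacity` policy objects are ever constructed, and a cache miss costs one get_state()/set_state() round trip rather than a full policy rebuild.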
73 changes: 57 additions & 16 deletions rllib/algorithms/algorithm_config.py
@@ -3,7 +3,17 @@
from gym.spaces import Space
import logging
import math
from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Tuple, Type, Union
from typing import (
Any,
Callable,
Container,
Dict,
Optional,
Tuple,
Type,
TYPE_CHECKING,
Union,
)

import ray
from ray.rllib.evaluation.rollout_worker import RolloutWorker
@@ -12,6 +22,7 @@
from ray.rllib.env.multi_agent_env import MultiAgentEnv
from ray.rllib.evaluation.collectors.sample_collector import SampleCollector
from ray.rllib.evaluation.collectors.simple_list_collector import SimpleListCollector
from ray.rllib.evaluation.episode import Episode
from ray.rllib.models import MODEL_DEFAULTS
from ray.rllib.policy.policy import Policy, PolicySpec
from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID
@@ -28,6 +39,7 @@
from ray.rllib.utils.from_config import from_config
from ray.rllib.utils.policy import validate_policy_id
from ray.rllib.utils.typing import (
AgentID,
AlgorithmConfigDict,
EnvConfigDict,
EnvType,
@@ -267,11 +279,11 @@ def __init__(self, algo_class=None):
# `self.multi_agent()`
self.policies = {DEFAULT_POLICY_ID: PolicySpec()}
self.policy_map_capacity = 100
self.policy_map_cache = None
self.policy_mapping_fn = (
lambda aid, episode, worker, **kwargs: DEFAULT_POLICY_ID
)
self.policies_to_train = None
self.policy_states_are_swappable = False
self.observation_fn = None
self.count_steps_by = "env_steps"

@@ -344,6 +356,12 @@ def __init__(self, algo_class=None):
self.timesteps_per_iteration = DEPRECATED_VALUE
self.min_iter_time_s = DEPRECATED_VALUE
self.collect_metrics_timeout = DEPRECATED_VALUE
self.min_time_s_per_reporting = DEPRECATED_VALUE
self.min_train_timesteps_per_reporting = DEPRECATED_VALUE
self.min_sample_timesteps_per_reporting = DEPRECATED_VALUE
self.input_evaluation = DEPRECATED_VALUE
self.policy_map_cache = DEPRECATED_VALUE

# The following values have moved because of the new ReplayBuffer API
self.buffer_size = DEPRECATED_VALUE
self.prioritized_replay = DEPRECATED_VALUE
@@ -358,7 +376,6 @@ def __init__(self, algo_class=None):
self.min_time_s_per_reporting = DEPRECATED_VALUE
self.min_train_timesteps_per_reporting = DEPRECATED_VALUE
self.min_sample_timesteps_per_reporting = DEPRECATED_VALUE
self.input_evaluation = DEPRECATED_VALUE
self.horizon = DEPRECATED_VALUE
self.soft_horizon = DEPRECATED_VALUE

@@ -458,9 +475,9 @@ def update_from_dict(
for k in [
"policies",
"policy_map_capacity",
"policy_map_cache",
"policy_mapping_fn",
"policies_to_train",
"policy_states_are_swappable",
"observation_fn",
"count_steps_by",
]
@@ -1601,13 +1618,21 @@ def multi_agent(
self,
*,
policies=NotProvided,
policy_map_capacity=NotProvided,
policy_map_cache=NotProvided,
policy_mapping_fn=NotProvided,
policies_to_train=NotProvided,
observation_fn=NotProvided,
count_steps_by=NotProvided,
policy_map_capacity: Optional[int] = NotProvided,
sven1977 (Contributor Author) commented:
Fixed type annotations.

Contributor reply:
nice.

policy_mapping_fn: Optional[
Callable[[AgentID, "Episode"], PolicyID]
] = NotProvided,
policies_to_train: Optional[
Union[Container[PolicyID], Callable[[PolicyID, SampleBatchType], bool]]
] = NotProvided,
policy_states_are_swappable: Optional[bool] = NotProvided,
observation_fn: Optional[Callable] = NotProvided,
count_steps_by: Optional[str] = NotProvided,
# Deprecated args:
replay_mode=DEPRECATED_VALUE,
# Now done via Ray object store, which has its own cloud-supported
# spillover mechanism.
policy_map_cache=DEPRECATED_VALUE,
) -> "AlgorithmConfig":
"""Sets the config's multi-agent settings.

@@ -1622,9 +1647,6 @@ def multi_agent(
observation- and action spaces of the policies, and any extra config.
policy_map_capacity: Keep this many policies in the "policy_map" (before
writing least-recently used ones to disk/S3).
policy_map_cache: Where to store overflowing (least-recently used) policies?
Could be a directory (str) or an S3 location. None for using the
default output dir.
policy_mapping_fn: Function mapping agent ids to policy ids. The signature
is: `(agent_id, episode, worker, **kwargs) -> PolicyID`.
policies_to_train: Determines those policies that should be updated.
Expand All @@ -1636,6 +1658,19 @@ def multi_agent(
or not, given the particular batch). This allows you to have a policy
trained only on certain data (e.g. when playing against a certain
opponent).
policy_states_are_swappable: Whether all Policy objects in this map can be
"swapped out" via a simple `state = A.get_state(); B.set_state(state)`,
where `A` and `B` are policy instances in this map. You should set
this to True for significantly speeding up the PolicyMap's cache lookup
times, iff your policies all share the same neural network
architecture and optimizer types. If True, the PolicyMap will not
have to garbage collect old, least recently used policies, but instead
keep them in memory and simply override their state with the state of
the most recently accessed one.
For example, in a league-based training setup, you might have 100s of
the same policies in your map (playing against each other in various
combinations), but all of them share the same state structure
(are "swappable").
observation_fn: Optional function that can be used to enhance the local
agent observations to include more state. See
rllib/evaluation/observation_function.py for more info.
@@ -1681,9 +1716,6 @@ def multi_agent(
if policy_map_capacity is not NotProvided:
self.policy_map_capacity = policy_map_capacity

if policy_map_cache is not NotProvided:
self.policy_map_cache = policy_map_cache

if policy_mapping_fn is not NotProvided:
# Attempt to create a `policy_mapping_fn` from config dict. Helpful
# if users would like to specify custom callable classes in yaml files.
@@ -1694,6 +1726,12 @@ def multi_agent(
if observation_fn is not NotProvided:
self.observation_fn = observation_fn

if policy_map_cache != DEPRECATED_VALUE:
deprecation_warning(
old="AlgorithmConfig.multi_agent(policy_map_cache=..)",
error=True,
)

if replay_mode != DEPRECATED_VALUE:
deprecation_warning(
old="AlgorithmConfig.multi_agent(replay_mode=..)",
@@ -1730,6 +1768,9 @@ def multi_agent(
)
self.policies_to_train = policies_to_train

if policy_states_are_swappable is not NotProvided:
self.policy_states_are_swappable = policy_states_are_swappable

return self

def is_multi_agent(self) -> bool:
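Putting the multi_agent() changes together, a minimal configuration sketch of the new flag (the policy ids and mapping function below are made up for illustration):

from ray.rllib.algorithms.algorithm_config import AlgorithmConfig
from ray.rllib.policy.policy import PolicySpec

config = (
    AlgorithmConfig()
    .multi_agent(
        # 100 structurally identical policies, only 10 of them live at a time.
        policies={f"policy_{i}": PolicySpec() for i in range(100)},
        policy_map_capacity=10,
        policy_mapping_fn=(
            lambda agent_id, episode, worker, **kwargs: f"policy_{hash(agent_id) % 100}"
        ),
        # All policies share the same network and optimizer layout, so the
        # PolicyMap may swap their states in place instead of rebuilding them.
        policy_states_are_swappable=True,
    )
)

Note that policy_map_cache is gone entirely: overflowing policy states now go through the Ray object store, which brings its own spillover mechanism, instead of a user-specified directory or S3 path.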
5 changes: 4 additions & 1 deletion rllib/algorithms/apex_dqn/apex_dqn.py
@@ -541,7 +541,10 @@ def sample_from_replay_buffer_place_on_learner_queue_non_blocking(
"""

def wait_on_replay_actors() -> List[Tuple[int, SampleBatchType]]:
"""Wait for the replay actors to finish sampling for timeout seconds."""
"""Wait for the replay actors to finish sampling for timeout seconds.

If the timeout is None, then block on the actors indefinitely.
"""
results = self._replay_actor_manager.fetch_ready_async_reqs(
timeout_seconds=self._replay_req_timeout_s
)
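The added docstring line spells out the usual Ray timeout convention: a numeric timeout returns whatever finished within that window, while None blocks until everything requested is ready. A small generic sketch using core ray.wait (the toy remote task is hypothetical, not the replay-actor manager API):

import ray

ray.init(ignore_reinit_error=True)

@ray.remote
def sample_batch(i):
    # Stand-in for a replay actor's sampling call.
    return i

refs = [sample_batch.remote(i) for i in range(4)]

# Numeric timeout: return whatever finished within 2 seconds.
ready, not_ready = ray.wait(refs, num_returns=len(refs), timeout=2.0)

# timeout=None: block until all requested results are ready.
ready, not_ready = ray.wait(refs, num_returns=len(refs), timeout=None)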
5 changes: 2 additions & 3 deletions rllib/algorithms/dt/tests/test_dt_policy.py
@@ -484,14 +484,13 @@ def test_loss_coef(self):
config2 = config.copy()
config2[f"loss_coef_{key}"] = 10.0
policy2 = DTTorchPolicy(observation_space, action_space, config2)
# copy the weights over so they output the same loss without scaling
policy2.set_state(policy1.get_state())
# Copy the weights over so they output the same loss without scaling.
policy2.set_weights(policy1.get_weights())

loss2 = policy2.loss(policy2.model, policy2.dist_class, batch)
loss2 = loss2.detach().cpu().item()

# compare loss, should be factor of 10 difference
# Compare loss, should be factor of 10 difference.
self.assertAlmostEqual(
loss2 / loss1,
10.0,
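The switch from set_state() to set_weights() is the substantive change here: a policy's full state typically bundles more than the raw model parameters (optimizer state, counters and, depending on the version, the policy's config), so copying the full state could erase the scaled loss_coef the test is measuring. A toy illustration of the distinction (TinyPolicy is a hypothetical stand-in, not RLlib's Policy class):

import torch

class TinyPolicy:
    def __init__(self, loss_coef):
        self.model = torch.nn.Linear(4, 2)
        self.config = {"loss_coef": loss_coef}

    def get_weights(self):
        return {k: v.clone() for k, v in self.model.state_dict().items()}

    def set_weights(self, weights):
        self.model.load_state_dict(weights)

    def get_state(self):
        # Full state: weights plus (here) the config.
        return {"weights": self.get_weights(), "config": dict(self.config)}

    def set_state(self, state):
        self.set_weights(state["weights"])
        self.config = dict(state["config"])

p1, p2 = TinyPolicy(1.0), TinyPolicy(10.0)
p2.set_weights(p1.get_weights())  # same parameters, p2 keeps loss_coef=10.0
assert p2.config["loss_coef"] == 10.0
p2.set_state(p1.get_state())      # would also clobber p2's config
assert p2.config["loss_coef"] == 1.0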