ray-project · sven1977 · Apr 20, 2022 · Mar 9, 2022 · Mar 9, 2022 · Mar 11, 2022
@@ -132,17 +132,24 @@ def sample_and_compute_grads(worker: RolloutWorker) -> Dict[str, Any]:
         # update that particular worker's weights.
         global_vars = None
         learner_info_builder = LearnerInfoBuilder(num_devices=1)
-        for worker, result in async_results.items():
-            # Apply gradients to local worker.
-            with self._timers[APPLY_GRADS_TIMER]:
-                local_worker.apply_gradients(result["grads"])
-            self._timers[APPLY_GRADS_TIMER].push_units_processed(result["agent_steps"])
-
-            # Update all step counters.
-            self._counters[NUM_AGENT_STEPS_SAMPLED] += result["agent_steps"]
-            self._counters[NUM_ENV_STEPS_SAMPLED] += result["env_steps"]
-            self._counters[NUM_AGENT_STEPS_TRAINED] += result["agent_steps"]
-            self._counters[NUM_ENV_STEPS_TRAINED] += result["env_steps"]
+        for worker, results in async_results.items():
+            for result in results:
+                # Apply gradients to local worker.
+                with self._timers[APPLY_GRADS_TIMER]:
+                    local_worker.apply_gradients(result["grads"])
+                self._timers[APPLY_GRADS_TIMER].push_units_processed(
+                    result["agent_steps"]
+                )
+
+                # Update all step counters.
+                self._counters[NUM_AGENT_STEPS_SAMPLED] += result["agent_steps"]
+                self._counters[NUM_ENV_STEPS_SAMPLED] += result["env_steps"]
+                self._counters[NUM_AGENT_STEPS_TRAINED] += result["agent_steps"]
+                self._counters[NUM_ENV_STEPS_TRAINED] += result["env_steps"]
+
+                learner_info_builder.add_learn_on_batch_results_multi_agent(
+                    result["infos"]
+                )
 
             # Create current global vars.
             global_vars = {
@@ -154,8 +161,6 @@ def sample_and_compute_grads(worker: RolloutWorker) -> Dict[str, Any]:
                 weights = local_worker.get_weights(local_worker.get_policies_to_train())
                 worker.set_weights.remote(weights, global_vars)
 
-            learner_info_builder.add_learn_on_batch_results_multi_agent(result["infos"])
-
         # Update global vars of the local worker.
         if global_vars:
             local_worker.set_global_vars(global_vars)

@@ -5,6 +5,7 @@
 from collections import defaultdict
 import gym
 from typing import DefaultDict, Optional, Type
+import tree
 
 import ray
 from ray.actor import ActorHandle
@@ -292,9 +293,10 @@ def training_iteration(self) -> ResultDict:
                 remote_fn=self._sample_and_send_to_buffer,
             )
         # Update sample counters.
-        for (env_steps, agent_steps) in sample_results.values():
-            self._counters[NUM_ENV_STEPS_SAMPLED] += env_steps
-            self._counters[NUM_AGENT_STEPS_SAMPLED] += agent_steps
+        for sample_result in sample_results.values():
+            for (env_steps, agent_steps) in sample_result:
+                self._counters[NUM_ENV_STEPS_SAMPLED] += env_steps
+                self._counters[NUM_AGENT_STEPS_SAMPLED] += agent_steps
 
         # Trigger asynchronous training update requests on all learning
         # policies.
@@ -314,11 +316,12 @@ def training_iteration(self) -> ResultDict:
             )
 
         # Update sample counters.
-        for result in train_results.values():
-            if NUM_AGENT_STEPS_TRAINED in result:
-                self._counters[NUM_AGENT_STEPS_TRAINED] += result[
-                    NUM_AGENT_STEPS_TRAINED
-                ]
+        for train_result in train_results.values():
+            for result in train_result:
+                if NUM_AGENT_STEPS_TRAINED in result:
+                    self._counters[NUM_AGENT_STEPS_TRAINED] += result[
+                        NUM_AGENT_STEPS_TRAINED
+                    ]
 
         # For those policies that have been updated in this iteration
         # (not all policies may have undergone an updated as we are
@@ -329,7 +332,13 @@ def training_iteration(self) -> ResultDict:
         with self._timers[SYNCH_WORKER_WEIGHTS_TIMER]:
             train_infos = {}
             policy_weights = {}
-            for pol_actor, policy_result in train_results.items():
+            for pol_actor, policy_results in train_results.items():
+                if len(policy_results) > 1:
+                    policy_result = tree.map_structure(
+                        lambda *_args: sum(_args) / len(policy_results), *policy_results
+                    )
+                else:
+                    policy_result = policy_results[0]
                 if policy_result:
                     pid = self.distributed_learners.get_policy_id(pol_actor)
                     train_infos[pid] = policy_result

diff --git a/rllib/agents/alpha_star/tests/test_alpha_star.py b/rllib/agents/alpha_star/tests/test_alpha_star.py
@@ -19,7 +19,8 @@
 class TestAlphaStar(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
-        ray.init(num_cpus=20)
+        # ray.init(num_cpus=20)
+        ray.init(local_mode=True)
 
     @classmethod
     def tearDownClass(cls):