ray-project · ericl · Apr 7, 2019 · Feb 26, 2019 · Feb 28, 2019 · Feb 28, 2019
diff --git a/python/ray/rllib/env/base_env.py b/python/ray/rllib/env/base_env.py
@@ -3,6 +3,7 @@
 from __future__ import print_function
 
 from ray.rllib.env.external_env import ExternalEnv
+from ray.rllib.env.external_multi_agent_env import ExternalMultiAgentEnv
 from ray.rllib.env.vector_env import VectorEnv
 from ray.rllib.env.multi_agent_env import MultiAgentEnv
 from ray.rllib.utils.annotations import override, PublicAPI
@@ -103,6 +104,11 @@ def to_base_env(env,
                         make_env=make_env,
                         existing_envs=[env],
                         num_envs=num_envs)
+            elif isinstance(env, ExternalMultiAgentEnv):
+                if num_envs != 1:
+                    raise ValueError(
+                        "ExternalMultiAgentEnv not currently support num_envs > 1.")
+                env = _ExternalEnvToBaseEnv(env, multiagent=True)
             elif isinstance(env, ExternalEnv):
                 if num_envs != 1:
                     raise ValueError(
@@ -196,9 +202,10 @@ def _with_dummy_agent_id(env_id_to_values, dummy_id=_DUMMY_AGENT_ID):
 class _ExternalEnvToBaseEnv(BaseEnv):
     """Internal adapter of ExternalEnv to BaseEnv."""
 
-    def __init__(self, external_env, preprocessor=None):
+    def __init__(self, external_env, preprocessor=None, multiagent=False):
         self.external_env = external_env
         self.prep = preprocessor
+        self.multiagent = multiagent
         self.action_space = external_env.action_space
         if preprocessor:
             self.observation_space = preprocessor.observation_space
@@ -223,16 +230,21 @@ def poll(self):
 
     @override(BaseEnv)
     def send_actions(self, action_dict):
-        for eid, action in action_dict.items():
-            self.external_env._episodes[eid].action_queue.put(
-                action[_DUMMY_AGENT_ID])
+        episodes = self.external_env._episodes
+        for eid, action in action_dict.items():
+            action_queue = episodes[eid].action_queue
+            # Single-agent actions are wrapped under the dummy agent id;
+            # multi-agent action dicts are forwarded whole.
+            action_queue.put(
+                action if self.multiagent else action[_DUMMY_AGENT_ID])
 
     def _poll(self):
         all_obs, all_rewards, all_dones, all_infos = {}, {}, {}, {}
         off_policy_actions = {}
         for eid, episode in self.external_env._episodes.copy().items():
             data = episode.get_data()
-            if episode.cur_done:
+            cur_done = episode.cur_done_dict["__all__"] if self.multiagent else episode.cur_done
+            if cur_done:
                 del self.external_env._episodes[eid]
             if data:
                 if self.prep:
@@ -244,11 +256,24 @@ def _poll(self):
                 all_infos[eid] = data["info"]
                 if "off_policy_action" in data:
                     off_policy_actions[eid] = data["off_policy_action"]
-        return _with_dummy_agent_id(all_obs), \
-            _with_dummy_agent_id(all_rewards), \
-            _with_dummy_agent_id(all_dones, "__all__"), \
-            _with_dummy_agent_id(all_infos), \
-            _with_dummy_agent_id(off_policy_actions)
+        if self.multiagent:
+            # ensure a consistent set of keys
+            # rely on all_obs having all possible keys for now
+            for eid, eid_dict in all_obs.items():
+                for agent_id in eid_dict.keys():
+                    def fix(d, zero_val):
+                        if agent_id not in d[eid]:
+                            d[eid][agent_id] = zero_val
+                    fix(all_rewards, 0.0)
+                    fix(all_dones, False)
+                    fix(all_infos, {})
+            return all_obs, all_rewards, all_dones, all_infos, off_policy_actions
+        else:
+            return _with_dummy_agent_id(all_obs), \
+                _with_dummy_agent_id(all_rewards), \
+                _with_dummy_agent_id(all_dones, "__all__"), \
+                _with_dummy_agent_id(all_infos), \
+                _with_dummy_agent_id(off_policy_actions)
 
 
 class _VectorEnvToBaseEnv(BaseEnv):

diff --git a/python/ray/rllib/env/external_multi_agent_env.py b/python/ray/rllib/env/external_multi_agent_env.py
@@ -0,0 +1,257 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from six.moves import queue
+import threading
+import uuid
+
+from ray.rllib.utils.annotations import PublicAPI
+from ray.rllib.env.external_env import ExternalEnv
+
+
+@PublicAPI
+class ExternalMultiAgentEnv(ExternalEnv):
+    """A multi-agent environment that interfaces with external agents.
+
+    Unlike simulator envs, control is inverted. The environment queries the
+    policy to obtain actions and logs observations and rewards for training.
+    This is in contrast to gym.Env, where the algorithm drives the simulation
+    through env.step() calls.
+
+    Like ExternalEnv, this can be used as the backend for policy serving (by
+    serving HTTP requests in the run loop), for ingesting offline logs data
+    (by reading offline transitions in the run loop), or other custom use
+    cases not easily expressed through gym.Env.
+
+    Observations, actions and rewards are exchanged as per-agent dicts. Both
+    on-policy actions (through self.get_action()) and off-policy actions
+    (through self.log_action()) are supported.
+
+    This env is thread-safe, but individual episodes must be executed serially.
+
+    Attributes:
+        action_space (gym.Space): Action space.
+        observation_space (gym.Space): Observation space.
+
+    Examples:
+        >>> register_env("my_env", lambda config: YourExternalEnv(config))
+        >>> agent = DQNAgent(env="my_env")
+        >>> while True:
+        ...     print(agent.train())
+    """
+
+    @PublicAPI
+    def __init__(self, action_space, observation_space, max_concurrent=100):
+        """Initialize a multi-agent external env.
+
+        ExternalMultiAgentEnv subclasses must call this during their __init__.
+
+        Arguments:
+            action_space (gym.Space): Action space of the env.
+            observation_space (gym.Space): Observation space of the env.
+            max_concurrent (int): Max number of active episodes to allow at
+                once. Exceeding this limit raises an error.
+
+        Raises:
+            ValueError: If only one of the spaces is a dict, or both are
+                dicts with different keys.
+        """
+        threading.Thread.__init__(self)
+        self.daemon = True
+        self.action_space = action_space
+        self.observation_space = observation_space
+        self._episodes = {}
+        self._finished = set()
+        self._results_avail_condition = threading.Condition()
+        self._max_concurrent_episodes = max_concurrent
+
+        # Per-agent spaces must be dicts keyed by the same agent ids.
+        spaces = (action_space, observation_space)
+        if any(isinstance(s, dict) for s in spaces) and not (
+                all(isinstance(s, dict) for s in spaces)
+                and set(action_space) == set(observation_space)):
+            raise ValueError("action_space and observation_space must be "
+                             "dicts with the same agent ids.")
+
+    @PublicAPI
+    def run(self):
+        """Override this to implement the run loop.
+
+        Your loop should continuously:
+            1. Call self.start_episode(episode_id)
+            2. Call self.get_action(episode_id, obs_dict)
+                    -or-
+                    self.log_action(episode_id, obs_dict, action_dict)
+            3. Call self.log_returns(episode_id, reward_dict)
+            4. Call self.end_episode(episode_id, obs_dict)
+            5. Wait if nothing to do.
+
+        Multiple episodes may be started at the same time.
+        """
+        raise NotImplementedError
+
+    @PublicAPI
+    def start_episode(self, episode_id=None, training_enabled=True):
+        """Record the start of an episode.
+
+        Arguments:
+            episode_id (str): Unique string id for the episode or None for
+                it to be auto-assigned.
+            training_enabled (bool): Whether to use experiences for this
+                episode to improve the policy.
+
+        Returns:
+            episode_id (str): Unique string id for the episode.
+        """
+        if episode_id is None:
+            episode_id = uuid.uuid4().hex
+
+        if episode_id in self._finished:
+            raise ValueError(
+                "Episode {} has already completed.".format(episode_id))
+
+        if episode_id in self._episodes:
+            raise ValueError(
+                "Episode {} is already started".format(episode_id))
+
+        self._episodes[episode_id] = _ExternalEnvEpisode(
+            episode_id, self._results_avail_condition, training_enabled)
+
+        return episode_id
+
+    @PublicAPI
+    def get_action(self, episode_id, observation_dict):
+        """Record an observation and get the on-policy action.
+
+        Arguments:
+            episode_id (str): Episode id returned from start_episode().
+            observation_dict (dict): Current observations of all agents
+                acting in this episode step.
+
+        Returns:
+            action (dict): Action from the env action space.
+        """
+        episode = self._get(episode_id)
+        return episode.wait_for_action(observation_dict)
+
+    @PublicAPI
+    def log_action(self, episode_id, observation_dict, action_dict):
+        """Record an observation and (off-policy) action taken.
+
+        Arguments:
+            episode_id (str): Episode id returned from start_episode().
+            observation_dict (dict): Current environment observation.
+            action_dict (dict): Action for the observation.
+        """
+        episode = self._get(episode_id)
+        episode.log_action(observation_dict, action_dict)
+
+    @PublicAPI
+    def log_returns(self, episode_id, reward_dict, info_dict=None):
+        """Record returns from the environment.
+
+        The reward will be attributed to the previous action taken by the
+        episode. Rewards accumulate until the next action. If no reward is
+        logged before the next action, a reward of 0.0 is assumed.
+
+        Arguments:
+            episode_id (str): Episode id returned from start_episode().
+            reward_dict (dict): Reward from the environment agents.
+            info_dict (dict): Optional info dict.
+        """
+        episode = self._get(episode_id)
+
+        # Accumulate rewards per agent until the next action is sent.
+        for agent, rew in reward_dict.items():
+            if agent in episode.cur_reward_dict:
+                episode.cur_reward_dict[agent] += rew
+            else:
+                episode.cur_reward_dict[agent] = rew
+        if info_dict:
+            episode.cur_info_dict = info_dict
+
+    @PublicAPI
+    def end_episode(self, episode_id, observation_dict):
+        """Record the end of an episode.
+
+        Arguments:
+            episode_id (str): Episode id returned from start_episode().
+            observation_dict (dict): Current environment observation.
+        """
+        episode = self._get(episode_id)
+        self._finished.add(episode.episode_id)
+        episode.done(observation_dict)
+
+    def _get(self, episode_id):
+        """Get a started episode or raise an error."""
+        if episode_id in self._finished:
+            raise ValueError(
+                "Episode {} has already completed.".format(episode_id))
+
+        if episode_id not in self._episodes:
+            raise ValueError("Episode {} not found.".format(episode_id))
+
+        return self._episodes[episode_id]
+
+
+class _ExternalEnvEpisode(object):
+    """Tracked state for each active episode."""
+
+    def reset_cur_done_dict(self, done=False):
+        self.cur_done_dict = {"__all__": done}
+
+    def __init__(self, episode_id, results_avail_condition, training_enabled):
+        self.episode_id = episode_id
+        self.results_avail_condition = results_avail_condition
+        self.training_enabled = training_enabled
+        self.data_queue = queue.Queue()
+        self.action_queue = queue.Queue()
+        self.new_observation_dict = None
+        self.new_action_dict = None
+        self.cur_reward_dict = {}
+        self.reset_cur_done_dict()
+        self.cur_info_dict = {}
+
+    def get_data(self):
+        """Return the next queued item, or None if none is available."""
+        try:
+            return self.data_queue.get_nowait()
+        except queue.Empty:
+            return None
+
+    def log_action(self, observation_dict, action_dict):
+        self.new_observation_dict = observation_dict
+        self.new_action_dict = action_dict
+        self._send()
+        # Block until an action is sent back; its value is discarded.
+        self.action_queue.get(True, timeout=60.0)
+
+    def wait_for_action(self, observation_dict):
+        self.new_observation_dict = observation_dict
+        self._send()
+        return self.action_queue.get(True, timeout=60.0)
+
+    def done(self, observation_dict):
+        self.new_observation_dict = observation_dict
+        self.reset_cur_done_dict(True)
+        self._send()
+
+    def _send(self):
+        """Queue the pending step data and signal that results are ready."""
+        item = {
+            "obs": self.new_observation_dict,
+            "reward": self.cur_reward_dict,
+            "done": self.cur_done_dict,
+            # Copy so the flag below never leaks into a caller-owned dict.
+            "info": dict(self.cur_info_dict),
+        }
+        if self.new_action_dict is not None:
+            item["off_policy_action"] = self.new_action_dict
+        if not self.training_enabled:
+            item["info"]["training_enabled"] = False
+        self.new_observation_dict = self.new_action_dict = None
+        self.cur_reward_dict = {}
+        with self.results_avail_condition:
+            self.data_queue.put_nowait(item)
+            self.results_avail_condition.notify()
diff --git a/python/ray/rllib/evaluation/policy_evaluator.py b/python/ray/rllib/evaluation/policy_evaluator.py
@@ -13,6 +13,7 @@
 from ray.rllib.env.env_context import EnvContext
 from ray.rllib.env.external_env import ExternalEnv
 from ray.rllib.env.multi_agent_env import MultiAgentEnv
+from ray.rllib.env.external_multi_agent_env import ExternalMultiAgentEnv
 from ray.rllib.env.vector_env import VectorEnv
 from ray.rllib.evaluation.interface import EvaluatorInterface
 from ray.rllib.evaluation.sample_batch import MultiAgentBatch, \
@@ -300,7 +301,8 @@ def make_env(vector_index):
 
         self.multiagent = set(self.policy_map.keys()) != {DEFAULT_POLICY_ID}
         if self.multiagent:
-            if not (isinstance(self.env, MultiAgentEnv)
+            if not ((isinstance(self.env, MultiAgentEnv)
+                    or isinstance(self.env, ExternalMultiAgentEnv))
                     or isinstance(self.env, BaseEnv)):
                 raise ValueError(
                     "Have multiple policy graphs {}, but the env ".format(