Appo #3779

Merged (19 commits) on Jan 18, 2019
18 changes: 18 additions & 0 deletions doc/source/rllib-algorithms.rst
@@ -88,6 +88,24 @@ SpaceInvaders 843 ~300
   :start-after: __sphinx_doc_begin__
   :end-before: __sphinx_doc_end__

Asynchronous Proximal Policy Optimization (APPO)
------------------------------------------------

`[paper] <https://arxiv.org/abs/1707.06347>`__
`[implementation] <https://github.com/ray-project/ray/blob/master/python/ray/rllib/agents/ppo/appo.py>`__
We include an asynchronous variant of Proximal Policy Optimization (PPO) based on the IMPALA architecture. It is similar to IMPALA, but uses a surrogate policy loss with clipping. Compared to synchronous PPO, APPO is more efficient in wall-clock time because it samples asynchronously. The clipped loss also allows multiple SGD passes over the same batch of experience, and therefore potentially better sample efficiency than IMPALA. V-trace can also be enabled to correct for off-policy samples.

This implementation is currently *experimental*. Consider also using `PPO <rllib-algorithms.html#proximal-policy-optimization-ppo>`__ or `IMPALA <rllib-algorithms.html#importance-weighted-actor-learner-architecture-impala>`__.

Tuned examples: `PongNoFrameskip-v4 <https://github.com/ray-project/ray/blob/master/python/ray/rllib/tuned_examples/pong-appo.yaml>`__

**APPO-specific configs** (see also `common configs <rllib-training.html#common-parameters>`__):

.. literalinclude:: ../../python/ray/rllib/agents/ppo/appo.py
   :language: python
   :start-after: __sphinx_doc_begin__
   :end-before: __sphinx_doc_end__
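
As a quick orientation, the following is a minimal sketch of running APPO directly from Python, assuming the ``APPOAgent`` class added in this pull request is importable from ``ray.rllib.agents.ppo``; it overrides ``num_gpus`` (which defaults to 1 above) so the sketch also runs on CPU-only machines:

.. code-block:: python

    import ray
    from ray.rllib.agents.ppo import APPOAgent

    ray.init()
    # Override a few of the APPO defaults shown above; everything else is
    # inherited from the IMPALA-derived DEFAULT_CONFIG.
    agent = APPOAgent(
        env="CartPole-v0",
        config={
            "num_workers": 2,
            "num_gpus": 0,     # default is 1; use 0 on CPU-only machines
            "vtrace": False,   # PPO-style GAE advantages instead of V-trace
        })

    for _ in range(3):
        result = agent.train()
        print(result["episode_reward_mean"])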

Gradient-based
~~~~~~~~~~~~~~

2 changes: 1 addition & 1 deletion doc/source/rllib-env.rst
@@ -11,7 +11,7 @@ RLlib works with several different types of environments, including `OpenAI Gym
Algorithm     Discrete Actions        Continuous Actions Multi-Agent Recurrent Policies
============= ======================= ================== =========== ==================
A2C, A3C      **Yes** `+parametric`_  **Yes**            **Yes**     **Yes**
PPO           **Yes** `+parametric`_  **Yes**            **Yes**     **Yes**
PPO, APPO     **Yes** `+parametric`_  **Yes**            **Yes**     **Yes**
PG            **Yes** `+parametric`_  **Yes**            **Yes**     **Yes**
IMPALA        **Yes** `+parametric`_  No                 **Yes**     **Yes**
DQN, Rainbow  **Yes** `+parametric`_  No                 **Yes**     No
2 changes: 2 additions & 0 deletions doc/source/rllib.rst
@@ -50,6 +50,8 @@ Algorithms

- `Importance Weighted Actor-Learner Architecture (IMPALA) <rllib-algorithms.html#importance-weighted-actor-learner-architecture-impala>`__

- `Asynchronous Proximal Policy Optimization (APPO) <rllib-algorithms.html#asynchronous-proximal-policy-optimization-appo>`__

* Gradient-based

- `Advantage Actor-Critic (A2C, A3C) <rllib-algorithms.html#advantage-actor-critic-a2c-a3c>`__
12 changes: 9 additions & 3 deletions python/ray/rllib/agents/agent.py
@@ -185,7 +185,13 @@
def with_common_config(extra_config):
    """Returns the given config dict merged with common agent confs."""

    config = copy.deepcopy(COMMON_CONFIG)
    return with_base_config(COMMON_CONFIG, extra_config)


def with_base_config(base_config, extra_config):
    """Returns the given config dict merged with a base agent conf."""

    config = copy.deepcopy(base_config)
    config.update(extra_config)
    return config
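
# Hypothetical usage sketch (illustration only): with_base_config lets a
# derived agent start from another agent's DEFAULT_CONFIG rather than from
# COMMON_CONFIG, which is how APPO reuses IMPALA's defaults later in this PR:
#
#     from ray.rllib.agents import impala
#     appo_defaults = with_base_config(impala.DEFAULT_CONFIG, {"vtrace": False})
#     assert appo_defaults["vtrace"] is False        # the override wins
#     assert "broadcast_interval" in appo_defaults   # inherited from IMPALA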

@@ -491,8 +497,8 @@ def export_policy_checkpoint(self,
    @classmethod
    def resource_help(cls, config):
        return ("\n\nYou can adjust the resource requests of RLlib agents by "
                "setting `num_workers` and other configs. See the "
                "DEFAULT_CONFIG defined by each agent for more info.\n\n"
                "setting `num_workers`, `num_gpus`, and other configs. See "
                "the DEFAULT_CONFIG defined by each agent for more info.\n\n"
                "The config of this agent is: {}".format(config))

    @staticmethod
12 changes: 8 additions & 4 deletions python/ray/rllib/agents/impala/impala.py
@@ -100,10 +100,7 @@ def _init(self):
        for k in OPTIMIZER_SHARED_CONFIGS:
            if k not in self.config["optimizer"]:
                self.config["optimizer"][k] = self.config[k]
        if self.config["vtrace"]:
            policy_cls = self._policy_graph
        else:
            policy_cls = A3CPolicyGraph
        policy_cls = self._get_policy_graph()
        self.local_evaluator = self.make_local_evaluator(
            self.env_creator, policy_cls)
        self.remote_evaluators = self.make_remote_evaluators(
@@ -124,3 +121,10 @@ def _train(self):
        result.update(timesteps_this_iter=self.optimizer.num_steps_sampled -
                      prev_steps)
        return result

    def _get_policy_graph(self):
        if self.config["vtrace"]:
            policy_cls = self._policy_graph
        else:
            policy_cls = A3CPolicyGraph
        return policy_cls
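    # NOTE: this hook lets subclasses substitute their own policy graph class;
    # APPOAgent (added later in this PR) overrides it to return
    # AsyncPPOPolicyGraph while reusing the rest of the IMPALA agent.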
2 changes: 1 addition & 1 deletion python/ray/rllib/agents/impala/vtrace_policy_graph.py
@@ -1,6 +1,6 @@
"""Adapted from A3CPolicyGraph to add V-trace.

Keep in sync with changes to A3CPolicyGraph."""
Keep in sync with changes to A3CPolicyGraph and VtraceSurrogatePolicyGraph."""

from __future__ import absolute_import
from __future__ import division
3 changes: 2 additions & 1 deletion python/ray/rllib/agents/ppo/__init__.py
@@ -1,3 +1,4 @@
from ray.rllib.agents.ppo.ppo import (PPOAgent, DEFAULT_CONFIG)
from ray.rllib.agents.ppo.appo import APPOAgent

__all__ = ["PPOAgent", "DEFAULT_CONFIG"]
__all__ = ["APPOAgent", "PPOAgent", "DEFAULT_CONFIG"]
65 changes: 65 additions & 0 deletions python/ray/rllib/agents/ppo/appo.py
@@ -0,0 +1,65 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from ray.rllib.agents.ppo.appo_policy_graph import AsyncPPOPolicyGraph
from ray.rllib.agents.agent import with_base_config
from ray.rllib.agents import impala
from ray.rllib.utils.annotations import override

# yapf: disable
# __sphinx_doc_begin__
DEFAULT_CONFIG = with_base_config(impala.DEFAULT_CONFIG, {
    # Whether to use V-trace weighted advantages. If false, PPO GAE advantages
    # will be used instead.
    "vtrace": False,

    # == These two options only apply if vtrace: False ==
    # If true, use the Generalized Advantage Estimator (GAE)
    # with a value function, see https://arxiv.org/pdf/1506.02438.pdf.
    "use_gae": True,
    # GAE(lambda) parameter
    "lambda": 1.0,

    # == PPO surrogate loss options ==
    "clip_param": 0.4,
    "kl_coeff": 0.2,
    "kl_target": 0.01,

    # == IMPALA optimizer params (see documentation in impala.py) ==
    "sample_batch_size": 50,
    "train_batch_size": 500,
    "min_iter_time_s": 10,
    "num_workers": 2,
    "num_gpus": 1,
    "num_data_loader_buffers": 1,
    "minibatch_buffer_size": 1,
    "num_sgd_iter": 1,
    "replay_proportion": 0.0,
    "replay_buffer_num_slots": 100,
    "max_sample_requests_in_flight_per_worker": 2,
    "broadcast_interval": 1,
    "grad_clip": 40.0,
    "opt_type": "adam",
    "lr": 0.0005,
    "lr_schedule": None,
    "decay": 0.99,
    "momentum": 0.0,
    "epsilon": 0.1,
    "vf_loss_coeff": 0.5,
    "entropy_coeff": -0.01,
})
# __sphinx_doc_end__
# yapf: enable


class APPOAgent(impala.ImpalaAgent):
    """PPO surrogate loss with IMPALA-architecture."""

    _agent_name = "APPO"
    _default_config = DEFAULT_CONFIG
    _policy_graph = AsyncPPOPolicyGraph

    @override(impala.ImpalaAgent)
    def _get_policy_graph(self):
        return AsyncPPOPolicyGraph
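
For reference, here is a hedged sketch of launching the tuned Pong experiment from Python instead of the `pong-appo.yaml` file, assuming the full pull request registers the agent under the name `APPO` in RLlib's agent registry (as the tuned example implies); the illustrative config values below may differ from the yaml's actual hyperparameters:

    import ray
    from ray import tune

    ray.init()
    tune.run_experiments({
        "pong-appo": {
            "run": "APPO",
            "env": "PongNoFrameskip-v4",
            "config": {
                "vtrace": True,     # correct for off-policy samples
                "num_workers": 8,   # illustrative only; see the tuned yaml
                "num_gpus": 1,
            },
        },
    })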