[RLlib] Tf2 + eager-tracing same speed as framework=tf; Add more test coverage for tf2+tracing. (#19981)
sven1977 authored Nov 5, 2021
1 parent 1341bb5 commit a931076
Showing 25 changed files with 482 additions and 349 deletions.
2 changes: 1 addition & 1 deletion rllib/BUILD
@@ -148,7 +148,7 @@ py_test(
 )
 
 py_test(
-    name = "run_regression_tests_frozenlake_appo",
+    name = "learning_frozenlake_appo",
     main = "tests/run_regression_tests.py",
     tags = ["team:ml", "learning_tests", "learning_tests_discrete"],
     size = "large",
4 changes: 2 additions & 2 deletions rllib/agents/a3c/tests/test_a2c.py
@@ -24,8 +24,8 @@ def test_a2c_compilation(self):
         num_iterations = 1
 
         # Test against all frameworks.
-        for _ in framework_iterator(config):
-            for env in ["PongDeterministic-v0"]:
+        for _ in framework_iterator(config, with_eager_tracing=True):
+            for env in ["CartPole-v0", "Pendulum-v1", "PongDeterministic-v0"]:
                 trainer = a3c.A2CTrainer(config=config, env=env)
                 for i in range(num_iterations):
                     results = trainer.train()
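
The `with_eager_tracing=True` flag is what adds the new tf2+tracing coverage: the test body runs once per framework and, for the eager TF frameworks, once more with `eager_tracing` enabled. Below is a minimal, self-contained sketch of that idea; `framework_iterator_sketch` is an illustrative stand-in, not RLlib's actual `framework_iterator` implementation.

    # Illustrative sketch only -- not RLlib's framework_iterator().
    # For the eager TF frameworks ("tf2"/"tfe") it yields the config twice,
    # once with tracing off and once with tracing on, so the same test body
    # exercises both code paths.
    def framework_iterator_sketch(config, frameworks=("tf2", "tf", "torch"),
                                  with_eager_tracing=False):
        for fw in frameworks:
            config["framework"] = fw
            if with_eager_tracing and fw in ("tf2", "tfe"):
                for tracing in (False, True):
                    config["eager_tracing"] = tracing
                    yield fw
            else:
                config["eager_tracing"] = False
                yield fw


    # Usage: everything nested under the `for` runs once per combination.
    config = {}
    for fw in framework_iterator_sketch(config, with_eager_tracing=True):
        print(fw, config["eager_tracing"])
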
2 changes: 1 addition & 1 deletion rllib/agents/a3c/tests/test_a3c.py
@@ -27,7 +27,7 @@ def test_a3c_compilation(self):
         num_iterations = 1
 
         # Test against all frameworks.
-        for _ in framework_iterator(config):
+        for _ in framework_iterator(config, with_eager_tracing=True):
             for env in ["CartPole-v1", "Pendulum-v1", "PongDeterministic-v0"]:
                 print("env={}".format(env))
                 config["model"]["use_lstm"] = env == "CartPole-v1"
2 changes: 1 addition & 1 deletion rllib/agents/cql/tests/test_cql.py
@@ -66,7 +66,7 @@ def test_cql_compilation(self):
         num_iterations = 4
 
         # Test for tf/torch frameworks.
-        for fw in framework_iterator(config):
+        for fw in framework_iterator(config, with_eager_tracing=True):
             trainer = cql.CQLTrainer(config=config)
             for i in range(num_iterations):
                 results = trainer.train()
2 changes: 1 addition & 1 deletion rllib/agents/ddpg/tests/test_apex_ddpg.py
@@ -24,7 +24,7 @@ def test_apex_ddpg_compilation_and_per_worker_epsilon_values(self):
         config["learning_starts"] = 0
         config["optimizer"]["num_replay_buffer_shards"] = 1
         num_iterations = 1
-        for _ in framework_iterator(config):
+        for _ in framework_iterator(config, with_eager_tracing=True):
             plain_config = config.copy()
             trainer = apex_ddpg.ApexDDPGTrainer(
                 config=plain_config, env="Pendulum-v1")
2 changes: 1 addition & 1 deletion rllib/agents/ddpg/tests/test_ddpg.py
@@ -41,7 +41,7 @@ def test_ddpg_compilation(self):
         num_iterations = 1
 
         # Test against all frameworks.
-        for _ in framework_iterator(config):
+        for _ in framework_iterator(config, with_eager_tracing=True):
             trainer = ddpg.DDPGTrainer(config=config, env="Pendulum-v1")
             for i in range(num_iterations):
                 results = trainer.train()
2 changes: 1 addition & 1 deletion rllib/agents/dqn/tests/test_apex_dqn.py
@@ -44,7 +44,7 @@ def test_apex_dqn_compilation_and_per_worker_epsilon_values(self):
         config["min_iter_time_s"] = 1
         config["optimizer"]["num_replay_buffer_shards"] = 1
 
-        for _ in framework_iterator(config):
+        for _ in framework_iterator(config, with_eager_tracing=True):
             plain_config = config.copy()
             trainer = apex.ApexTrainer(config=plain_config, env="CartPole-v0")
 
2 changes: 1 addition & 1 deletion rllib/agents/dqn/tests/test_simple_q.py
@@ -34,7 +34,7 @@ def test_simple_q_compilation(self):
 
         num_iterations = 2
 
-        for _ in framework_iterator(config):
+        for _ in framework_iterator(config, with_eager_tracing=True):
             trainer = dqn.SimpleQTrainer(config=config, env="CartPole-v0")
             rw = trainer.workers.local_worker()
             for i in range(num_iterations):
2 changes: 1 addition & 1 deletion rllib/agents/impala/tests/test_impala.py
@@ -30,7 +30,7 @@ def test_impala_compilation(self):
         num_iterations = 1
         env = "CartPole-v0"
 
-        for _ in framework_iterator(config):
+        for _ in framework_iterator(config, with_eager_tracing=True):
             local_cfg = config.copy()
             for lstm in [False, True]:
                 local_cfg["num_aggregation_workers"] = 0 if not lstm else 1
2 changes: 1 addition & 1 deletion rllib/agents/ppo/tests/test_appo.py
@@ -24,7 +24,7 @@ def test_appo_compilation(self):
         config["num_workers"] = 1
         num_iterations = 2
 
-        for _ in framework_iterator(config):
+        for _ in framework_iterator(config, with_eager_tracing=True):
             print("w/o v-trace")
             _config = config.copy()
             _config["vtrace"] = False
2 changes: 1 addition & 1 deletion rllib/agents/ppo/tests/test_ppo.py
@@ -106,7 +106,7 @@ def test_ppo_compilation_and_schedule_mixins(self):
         config["compress_observations"] = True
         num_iterations = 2
 
-        for fw in framework_iterator(config):
+        for fw in framework_iterator(config, with_eager_tracing=True):
             for env in ["FrozenLake-v1", "MsPacmanNoFrameskip-v4"]:
                 print("Env={}".format(env))
                 for lstm in [True, False]:
19 changes: 14 additions & 5 deletions rllib/agents/trainer.py
@@ -230,14 +230,23 @@
 
     # === Deep Learning Framework Settings ===
     # tf: TensorFlow (static-graph)
-    # tf2: TensorFlow 2.x (eager)
-    # tfe: TensorFlow eager
+    # tf2: TensorFlow 2.x (eager or traced, if eager_tracing=True)
+    # tfe: TensorFlow eager (or traced, if eager_tracing=True)
     # torch: PyTorch
     "framework": "tf",
-    # Enable tracing in eager mode. This greatly improves performance, but
-    # makes it slightly harder to debug since Python code won't be evaluated
-    # after the initial eager pass. Only possible if framework=tfe.
+    # Enable tracing in eager mode. This greatly improves performance
+    # (speedup ~2x), but makes it slightly harder to debug since Python
+    # code won't be evaluated after the initial eager pass.
+    # Only possible if framework=[tf2|tfe].
     "eager_tracing": False,
+    # Maximum number of tf.function re-traces before a runtime error is raised.
+    # This is to prevent unnoticed retraces of methods inside the
+    # `..._eager_traced` Policy, which could slow down execution by a
+    # factor of 4, without the user noticing what the root cause for this
+    # slowdown could be.
+    # Only necessary for framework=[tf2|tfe].
+    # Set to None to ignore the re-trace count and never throw an error.
+    "eager_max_retraces": 20,
 
     # === Exploration Settings ===
     # Default exploration behavior, iff `explore`=None is passed into
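
Putting the three framework-related keys together, a tf2 setup that uses the traced eager path described above could look like the sketch below. The key names and defaults come straight from this diff; the trainer class and environment in the commented usage are only examples.

    # Sketch of a config using the tf2 + eager-tracing path from this commit.
    config = {
        "framework": "tf2",
        # Trace the eager policy with tf.function for roughly the speed of
        # the static-graph ("tf") path.
        "eager_tracing": True,
        # Raise after 20 unnoticed tf.function re-traces (None disables the check).
        "eager_max_retraces": 20,
        "num_workers": 0,
    }

    # Hypothetical usage (requires a matching ray[rllib] install):
    # from ray.rllib.agents import ppo
    # trainer = ppo.PPOTrainer(config=config, env="CartPole-v0")
    # print(trainer.train())
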
18 changes: 11 additions & 7 deletions rllib/examples/env/random_env.py
@@ -10,8 +10,8 @@ class RandomEnv(gym.Env):
 
     Can be instantiated with arbitrary action-, observation-, and reward
     spaces. Observations and rewards are generated by simply sampling from the
-    observation/reward spaces. The probability of a `done=True` can be
-    configured as well.
+    observation/reward spaces. The probability of a `done=True` after each
+    action can be configured, as well as the max episode length.
     """
 
     def __init__(self, config=None):
@@ -26,8 +26,13 @@ def __init__(self, config=None):
             "reward_space",
             gym.spaces.Box(low=-1.0, high=1.0, shape=(), dtype=np.float32))
         # Chance that an episode ends at any step.
+        # Note that a max episode length can be specified via
+        # `max_episode_len`.
         self.p_done = config.get("p_done", 0.1)
-        # A max episode length.
+        # A max episode length. Even if the `p_done` sampling does not lead
+        # to a terminus, the episode will end after at most this many
+        # timesteps.
+        # Set to 0 or None for using no limit on the episode length.
         self.max_episode_len = config.get("max_episode_len", None)
         # Whether to check action bounds.
         self.check_action_bounds = config.get("check_action_bounds", False)
@@ -49,11 +54,10 @@ def step(self, action):
 
         self.steps += 1
         done = False
-        # We are done as per our max-episode-len.
-        if self.max_episode_len is not None and \
-                self.steps >= self.max_episode_len:
+        # We are `done` as per our max-episode-len.
+        if self.max_episode_len and self.steps >= self.max_episode_len:
            done = True
-        # Max not reached yet -> Sample done via p_done.
+        # Max episode length not reached yet -> Sample `done` via `p_done`.
         elif self.p_done > 0.0:
             done = bool(
                 np.random.choice(
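
The two knobs touched here, `p_done` and `max_episode_len`, combine exactly as the new comments say: sampling can end the episode early, and the length cap always ends it. A usage sketch with `p_done=0.0`, so that only the cap terminates the episode (import path taken from the file location above; assumes a Ray/gym install containing this change):

    from ray.rllib.examples.env.random_env import RandomEnv

    # With p_done=0.0 the episode can only end via the max_episode_len cap.
    env = RandomEnv({
        "p_done": 0.0,
        "max_episode_len": 10,
    })

    obs = env.reset()
    done, steps = False, 0
    while not done:
        obs, reward, done, info = env.step(env.action_space.sample())
        steps += 1
    print("episode length:", steps)  # expected: 10
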
3 changes: 2 additions & 1 deletion rllib/examples/models/batch_norm_model.py
@@ -196,7 +196,8 @@ def __init__(self, obs_space, action_space, num_outputs, model_config,
     def forward(self, input_dict, state, seq_lens):
         # Set the correct train-mode for our hidden module (only important
         # b/c we have some batch-norm layers).
-        self._hidden_layers.train(mode=input_dict.get("is_training", False))
+        self._hidden_layers.train(
+            mode=bool(input_dict.get("is_training", False)))
         self._hidden_out = self._hidden_layers(input_dict["obs"])
         logits = self._logits(self._hidden_out)
         return logits, []
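
The only change here is wrapping the `is_training` value in `bool()` before handing it to `nn.Module.train()`. A small standalone illustration of why the cast matters, under the assumption that the flag can arrive as something other than a plain Python bool (e.g. a tensor):

    import torch
    import torch.nn as nn

    bn = nn.BatchNorm1d(4)

    flag = torch.tensor(True)   # an `is_training` value that is not a plain bool
    bn.train(mode=bool(flag))   # cast first, as the fixed forward() now does
    print(bn.training)          # True

    # Passing the raw tensor instead would either set `training` to a tensor or
    # trip the boolean type check in newer torch versions, breaking batch-norm's
    # train/eval switching.
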
8 changes: 7 additions & 1 deletion rllib/execution/multi_gpu_impl.py
@@ -1,5 +1,11 @@
 from ray.rllib.policy.dynamic_tf_policy import TFMultiGPUTowerStack
 from ray.rllib.utils.deprecation import deprecation_warning
 
-deprecation_warning("LocalSyncParallelOptimizer", "TFMultiGPUTowerStack")
+# Backward compatibility.
+deprecation_warning(
+    old="ray.rllib.execution.multi_gpu_impl.LocalSyncParallelOptimizer",
+    new="ray.rllib.policy.dynamic_tf_policy.TFMultiGPUTowerStack",
+    error=False,
+)
+# Old name.
 LocalSyncParallelOptimizer = TFMultiGPUTowerStack
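
The expanded `deprecation_warning(old=..., new=..., error=False)` call plus the alias keeps old import paths working while pointing users at the new location. A usage sketch (assumes a Ray install containing this change):

    # Importing the old name still works; it resolves to the new class and
    # emits a deprecation warning instead of raising.
    from ray.rllib.execution.multi_gpu_impl import LocalSyncParallelOptimizer
    from ray.rllib.policy.dynamic_tf_policy import TFMultiGPUTowerStack

    assert LocalSyncParallelOptimizer is TFMultiGPUTowerStack
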
8 changes: 7 additions & 1 deletion rllib/execution/multi_gpu_learner.py
@@ -2,6 +2,12 @@
     MultiGPULearnerThread, _MultiGPULoaderThread
 from ray.rllib.utils.deprecation import deprecation_warning
 
-deprecation_warning("multi_gpu_learner.py", "multi_gpu_learner_thread.py")
+# Backward compatibility.
+deprecation_warning(
+    old="ray.rllib.execution.multi_gpu_learner.py",
+    new="ray.rllib.execution.multi_gpu_learner_thread.py",
+    error=False,
+)
+# Old names.
 TFMultiGPULearner = MultiGPULearnerThread
 _LoaderThread = _MultiGPULoaderThread
5 changes: 5 additions & 0 deletions rllib/models/modelv2.py
@@ -243,6 +243,11 @@ def __call__(
         with self.context():
             res = self.forward(restored, state or [], seq_lens)
 
+        if isinstance(input_dict, SampleBatch):
+            input_dict.accessed_keys = restored.accessed_keys - {"obs_flat"}
+            input_dict.deleted_keys = restored.deleted_keys
+            input_dict.added_keys = restored.added_keys - {"obs_flat"}
+
         if ((not isinstance(res, list) and not isinstance(res, tuple))
                 or len(res) != 2):
             raise ValueError(
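
The new block copies the key-tracking sets from the internal `restored` batch back onto the caller's `SampleBatch` (minus the synthetic `obs_flat` entry), so whoever inspects `input_dict` afterwards sees which keys the model actually touched. A toy sketch of that kind of access tracking, using a plain dict subclass rather than RLlib's `SampleBatch`:

    # Toy illustration of access tracking (not RLlib's SampleBatch).
    class TrackingDict(dict):
        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)
            self.accessed_keys = set()

        def __getitem__(self, key):
            self.accessed_keys.add(key)
            return super().__getitem__(key)


    restored = TrackingDict(obs=[1, 2, 3], obs_flat=[1, 2, 3], prev_actions=[0])
    _ = restored["obs"]
    _ = restored["obs_flat"]

    # Mirror what ModelV2.__call__ now does: report what the model used,
    # ignoring the auto-generated "obs_flat" view.
    print(restored.accessed_keys - {"obs_flat"})  # {'obs'}
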
4 changes: 2 additions & 2 deletions rllib/policy/dynamic_tf_policy.py
@@ -250,7 +250,7 @@ def __init__(
             True, (), name="is_exploring")
 
         # Placeholder for `is_training` flag.
-        self._input_dict.is_training = self._get_is_training_placeholder()
+        self._input_dict.set_training(self._get_is_training_placeholder())
 
         # Multi-GPU towers do not need any action computing/exploration
         # graphs.
@@ -464,7 +464,7 @@ def load_batch_into_buffer(
             buffer_index: int = 0,
     ) -> int:
         # Set the is_training flag of the batch.
-        batch.is_training = True
+        batch.set_training(True)
 
         # Shortcut for 1 CPU only: Store batch in
         # `self._loaded_single_cpu_batch`.
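
Both hunks in this file swap the direct `is_training` attribute assignment for the `SampleBatch.set_training()` setter. A short usage sketch (assumes a Ray version where `set_training()` exists, as used in this commit):

    from ray.rllib.policy.sample_batch import SampleBatch

    batch = SampleBatch({"obs": [[0.1, 0.2]], "actions": [0]})
    # New setter-based API used throughout this commit:
    batch.set_training(True)
    # Old style, now replaced (kept here only as a comment):
    # batch.is_training = True
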