From edfe921db4fd743fbb7b988670048019b20448c9 Mon Sep 17 00:00:00 2001 From: peterghaddad Date: Mon, 15 Apr 2024 20:07:32 -0600 Subject: [PATCH 01/16] initial --- .../classes/pyflyt_quadx_waypoints_env.py | 29 +++++++ rllib/examples/quadx_waypoints.py | 87 +++++++++++++++++++ 2 files changed, 116 insertions(+) create mode 100644 rllib/examples/envs/classes/pyflyt_quadx_waypoints_env.py create mode 100644 rllib/examples/quadx_waypoints.py diff --git a/rllib/examples/envs/classes/pyflyt_quadx_waypoints_env.py b/rllib/examples/envs/classes/pyflyt_quadx_waypoints_env.py new file mode 100644 index 000000000000..a7a34ffe2eab --- /dev/null +++ b/rllib/examples/envs/classes/pyflyt_quadx_waypoints_env.py @@ -0,0 +1,29 @@ +from ray.tune.registry import register_env +from gymnasium.wrappers import RecordVideo +from PyFlyt.gym_envs import FlattenWaypointEnv +from gymnasium.wrappers import TransformReward + +import gymnasium as gym +import PyFlyt.gym_envs # noqa + +class RewardWrapper(gym.RewardWrapper): + def __init__(self, env): + super().__init__(env) + def reward(self, reward): + # Scale rewards: + if reward >= 99.0 or reward <= -99.0: + return reward / 10 + return reward + +class QuadXWayPointsEnv(gym.Env): + from gymnasium.wrappers import RecordVideo + import PyFlyt.gym_envs # Must be here + from PyFlyt.gym_envs import FlattenWaypointEnv + from gymnasium.wrappers import TransformReward + + def __init__(self, config=None): + env = gym.make("PyFlyt/QuadX-Waypoints-v1") + # Wrap Environment to use max 10 and -10 for rewards + env = RewardWrapper(env) + + self.env = FlattenWaypointEnv(env, context_length=1) \ No newline at end of file diff --git a/rllib/examples/quadx_waypoints.py b/rllib/examples/quadx_waypoints.py new file mode 100644 index 000000000000..69be9348d64b --- /dev/null +++ b/rllib/examples/quadx_waypoints.py @@ -0,0 +1,87 @@ +# TODO (sven): Move this example script into the new API stack. +# TODO (sven): Move this script to `examples/rl_modules/...` + +import argparse +import os + +from ray.rllib.examples.env.pyflyt_quadx_waypoints_env import QuadXWayPointsEnv +from ray.rllib.utils.test_utils import check_learning_achieved +from ray.tune.registry import get_trainable_cls + +parser = argparse.ArgumentParser() +parser.add_argument( + "--run", type=str, default="PPO", help="The RLlib-registered algorithm to use." +) +parser.add_argument("--num-cpus", type=int, default=0) +parser.add_argument( + "--framework", + choices=["tf", "tf2", "torch"], + default="torch", + help="The DL framework specifier.", +) +parser.add_argument("--use-prev-action", action="store_true") +parser.add_argument("--use-prev-reward", action="store_true") +parser.add_argument( + "--as-test", + action="store_true", + help="Whether this script should be run as a test: --stop-reward must " + "be achieved within --stop-timesteps AND --stop-iters.", +) +parser.add_argument( + "--stop-iters", type=int, default=500, help="Number of iterations to train." +) +parser.add_argument( + "--stop-timesteps", type=int, default=1000000, help="Number of timesteps to train." +) +parser.add_argument( + "--stop-reward", type=float, default=90.0, help="Reward at which we stop training." 
+) + +if __name__ == "__main__": + import ray + from ray import air, tune + + args = parser.parse_args() + + ray.init() + + algo_cls = get_trainable_cls(args.run) + config = algo_cls.get_default_config() + + config.environment(env=QuadXWayPointsEnv).resources( + num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0")) + ).framework(args.framework).reporting(min_time_s_per_iteration=0.1).training( + model={ + "use_lstm": True, + "lstm_cell_size": 32, + "lstm_use_prev_action": args.use_prev_action, + "lstm_use_prev_reward": args.use_prev_reward, + } + ) + + if args.run == "PPO": + config.training(num_sgd_iter=5, vf_loss_coeff=0.0001, train_batch_size=512) + config.model["vf_share_layers"] = True + elif args.run == "IMPALA": + config.rollouts(num_rollout_workers=2) + config.resources(num_gpus=0) + config.training(vf_loss_coeff=0.01) + + stop = { + "training_iteration": args.stop_iters, + "timesteps_total": args.stop_timesteps, + "episode_reward_mean": args.stop_reward, + } + + tuner = tune.Tuner( + args.run, + param_space=config.to_dict(), + run_config=air.RunConfig( + stop=stop, + ), + ) + results = tuner.fit() + + if args.as_test: + check_learning_achieved(results, args.stop_reward) + ray.shutdown() From 53aa9849df2e03406ce7c3902189a5af86ec701f Mon Sep 17 00:00:00 2001 From: peterghaddad Date: Tue, 16 Apr 2024 08:54:12 -0600 Subject: [PATCH 02/16] Lint --- .../envs/classes/pyflyt_quadx_waypoints_env.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/rllib/examples/envs/classes/pyflyt_quadx_waypoints_env.py b/rllib/examples/envs/classes/pyflyt_quadx_waypoints_env.py index a7a34ffe2eab..c51b95d6e23c 100644 --- a/rllib/examples/envs/classes/pyflyt_quadx_waypoints_env.py +++ b/rllib/examples/envs/classes/pyflyt_quadx_waypoints_env.py @@ -4,26 +4,29 @@ from gymnasium.wrappers import TransformReward import gymnasium as gym -import PyFlyt.gym_envs # noqa +import PyFlyt.gym_envs # noqa + class RewardWrapper(gym.RewardWrapper): def __init__(self, env): super().__init__(env) + def reward(self, reward): # Scale rewards: if reward >= 99.0 or reward <= -99.0: return reward / 10 return reward + class QuadXWayPointsEnv(gym.Env): from gymnasium.wrappers import RecordVideo - import PyFlyt.gym_envs # Must be here + import PyFlyt.gym_envs # Must be here from PyFlyt.gym_envs import FlattenWaypointEnv from gymnasium.wrappers import TransformReward - + def __init__(self, config=None): env = gym.make("PyFlyt/QuadX-Waypoints-v1") # Wrap Environment to use max 10 and -10 for rewards env = RewardWrapper(env) - - self.env = FlattenWaypointEnv(env, context_length=1) \ No newline at end of file + + self.env = FlattenWaypointEnv(env, context_length=1) From 4c13d0535e48df9e853cb87675eb396fff5cebc7 Mon Sep 17 00:00:00 2001 From: peterghaddad Date: Tue, 16 Apr 2024 17:51:36 -0600 Subject: [PATCH 03/16] lint --- .../envs/classes/pyflyt_quadx_waypoints_env.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/rllib/examples/envs/classes/pyflyt_quadx_waypoints_env.py b/rllib/examples/envs/classes/pyflyt_quadx_waypoints_env.py index c51b95d6e23c..3bec2fc6981a 100644 --- a/rllib/examples/envs/classes/pyflyt_quadx_waypoints_env.py +++ b/rllib/examples/envs/classes/pyflyt_quadx_waypoints_env.py @@ -1,10 +1,5 @@ -from ray.tune.registry import register_env -from gymnasium.wrappers import RecordVideo from PyFlyt.gym_envs import FlattenWaypointEnv -from gymnasium.wrappers import TransformReward - import gymnasium as gym -import PyFlyt.gym_envs # noqa class 
RewardWrapper(gym.RewardWrapper): @@ -19,10 +14,7 @@ def reward(self, reward): class QuadXWayPointsEnv(gym.Env): - from gymnasium.wrappers import RecordVideo - import PyFlyt.gym_envs # Must be here - from PyFlyt.gym_envs import FlattenWaypointEnv - from gymnasium.wrappers import TransformReward + import PyFlyt.gym_envs # noqa def __init__(self, config=None): env = gym.make("PyFlyt/QuadX-Waypoints-v1") From bedb196e1cc3c7ec431249fb352f3af1a21f4437 Mon Sep 17 00:00:00 2001 From: peterghaddad Date: Wed, 17 Apr 2024 07:57:02 -0600 Subject: [PATCH 04/16] lint --- .../envs/classes/pyflyt_quadx_waypoints_env.py | 13 ++++++------- rllib/examples/quadx_waypoints.py | 14 ++++++++------ 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/rllib/examples/envs/classes/pyflyt_quadx_waypoints_env.py b/rllib/examples/envs/classes/pyflyt_quadx_waypoints_env.py index 3bec2fc6981a..f151ed3d1023 100644 --- a/rllib/examples/envs/classes/pyflyt_quadx_waypoints_env.py +++ b/rllib/examples/envs/classes/pyflyt_quadx_waypoints_env.py @@ -1,4 +1,3 @@ -from PyFlyt.gym_envs import FlattenWaypointEnv import gymnasium as gym @@ -13,12 +12,12 @@ def reward(self, reward): return reward -class QuadXWayPointsEnv(gym.Env): +def create_quadx_waypoints_env(env_config): import PyFlyt.gym_envs # noqa + from PyFlyt.gym_envs import FlattenWaypointEnv - def __init__(self, config=None): - env = gym.make("PyFlyt/QuadX-Waypoints-v1") - # Wrap Environment to use max 10 and -10 for rewards - env = RewardWrapper(env) + env = gym.make("PyFlyt/QuadX-Waypoints-v1") + # Wrap Environment to use max 10 and -10 for rewards + env = RewardWrapper(env) - self.env = FlattenWaypointEnv(env, context_length=1) + return FlattenWaypointEnv(env, context_length=1) diff --git a/rllib/examples/quadx_waypoints.py b/rllib/examples/quadx_waypoints.py index 69be9348d64b..761b00459df9 100644 --- a/rllib/examples/quadx_waypoints.py +++ b/rllib/examples/quadx_waypoints.py @@ -1,10 +1,9 @@ -# TODO (sven): Move this example script into the new API stack. -# TODO (sven): Move this script to `examples/rl_modules/...` - import argparse import os -from ray.rllib.examples.env.pyflyt_quadx_waypoints_env import QuadXWayPointsEnv +from ray.rllib.examples.envs.classes.pyflyt_quadx_waypoints_env import ( + create_quadx_waypoints_env, +) from ray.rllib.utils.test_utils import check_learning_achieved from ray.tune.registry import get_trainable_cls @@ -12,7 +11,7 @@ parser.add_argument( "--run", type=str, default="PPO", help="The RLlib-registered algorithm to use." 
) -parser.add_argument("--num-cpus", type=int, default=0) +parser.add_argument("--num-cpus", type=int, default=4) parser.add_argument( "--framework", choices=["tf", "tf2", "torch"], @@ -40,15 +39,18 @@ if __name__ == "__main__": import ray from ray import air, tune + from ray.tune.registry import register_env args = parser.parse_args() ray.init() + register_env("quadx_waypoints", create_quadx_waypoints_env) + algo_cls = get_trainable_cls(args.run) config = algo_cls.get_default_config() - config.environment(env=QuadXWayPointsEnv).resources( + config.environment(env="quadx_waypoints").resources( num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0")) ).framework(args.framework).reporting(min_time_s_per_iteration=0.1).training( model={ From 7cfaea3336c422b9357d0ad6956484c0c6288cb4 Mon Sep 17 00:00:00 2001 From: peterghaddad Date: Tue, 14 May 2024 06:38:16 -0600 Subject: [PATCH 05/16] Remove file --- .../classes/pyflyt_quadx_waypoints_env.py | 23 --------------- rllib/examples/quadx_waypoints.py | 28 ++++++++++++++++--- 2 files changed, 24 insertions(+), 27 deletions(-) delete mode 100644 rllib/examples/envs/classes/pyflyt_quadx_waypoints_env.py diff --git a/rllib/examples/envs/classes/pyflyt_quadx_waypoints_env.py b/rllib/examples/envs/classes/pyflyt_quadx_waypoints_env.py deleted file mode 100644 index f151ed3d1023..000000000000 --- a/rllib/examples/envs/classes/pyflyt_quadx_waypoints_env.py +++ /dev/null @@ -1,23 +0,0 @@ -import gymnasium as gym - - -class RewardWrapper(gym.RewardWrapper): - def __init__(self, env): - super().__init__(env) - - def reward(self, reward): - # Scale rewards: - if reward >= 99.0 or reward <= -99.0: - return reward / 10 - return reward - - -def create_quadx_waypoints_env(env_config): - import PyFlyt.gym_envs # noqa - from PyFlyt.gym_envs import FlattenWaypointEnv - - env = gym.make("PyFlyt/QuadX-Waypoints-v1") - # Wrap Environment to use max 10 and -10 for rewards - env = RewardWrapper(env) - - return FlattenWaypointEnv(env, context_length=1) diff --git a/rllib/examples/quadx_waypoints.py b/rllib/examples/quadx_waypoints.py index 761b00459df9..caafd6116edd 100644 --- a/rllib/examples/quadx_waypoints.py +++ b/rllib/examples/quadx_waypoints.py @@ -1,11 +1,9 @@ import argparse import os -from ray.rllib.examples.envs.classes.pyflyt_quadx_waypoints_env import ( - create_quadx_waypoints_env, -) from ray.rllib.utils.test_utils import check_learning_achieved from ray.tune.registry import get_trainable_cls +import gymnasium as gym parser = argparse.ArgumentParser() parser.add_argument( @@ -36,6 +34,28 @@ "--stop-reward", type=float, default=90.0, help="Reward at which we stop training." 
) +class RewardWrapper(gym.RewardWrapper): + def __init__(self, env): + super().__init__(env) + + def reward(self, reward): + # Scale rewards: + if reward >= 99.0 or reward <= -99.0: + return reward / 10 + return reward + + +def create_quadx_waypoints_env(env_config): + import PyFlyt.gym_envs # noqa + from PyFlyt.gym_envs import FlattenWaypointEnv + + env = gym.make("PyFlyt/QuadX-Waypoints-v1") + # Wrap Environment to use max 10 and -10 for rewards + env = RewardWrapper(env) + + return FlattenWaypointEnv(env, context_length=1) + + if __name__ == "__main__": import ray from ray import air, tune @@ -45,7 +65,7 @@ ray.init() - register_env("quadx_waypoints", create_quadx_waypoints_env) + register_env("quadx_waypoints", env_creator=create_quadx_waypoints_env) algo_cls = get_trainable_cls(args.run) config = algo_cls.get_default_config() From 803c129cd715193a4d55e31862341d32838d11f7 Mon Sep 17 00:00:00 2001 From: peterghaddad Date: Tue, 14 May 2024 11:44:13 -0600 Subject: [PATCH 06/16] Address feedback --- rllib/examples/quadx_waypoints.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/rllib/examples/quadx_waypoints.py b/rllib/examples/quadx_waypoints.py index caafd6116edd..e0857ef2927d 100644 --- a/rllib/examples/quadx_waypoints.py +++ b/rllib/examples/quadx_waypoints.py @@ -12,7 +12,7 @@ parser.add_argument("--num-cpus", type=int, default=4) parser.add_argument( "--framework", - choices=["tf", "tf2", "torch"], + choices=["tf2", "torch"], default="torch", help="The DL framework specifier.", ) @@ -70,15 +70,24 @@ def create_quadx_waypoints_env(env_config): algo_cls = get_trainable_cls(args.run) config = algo_cls.get_default_config() - config.environment(env="quadx_waypoints").resources( + config.environment( + env="quadx_waypoints" + ).resources( num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0")) - ).framework(args.framework).reporting(min_time_s_per_iteration=0.1).training( - model={ + ).framework( + args.framework + ).api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, + ).rl_module( + model_config_dict={ "use_lstm": True, "lstm_cell_size": 32, "lstm_use_prev_action": args.use_prev_action, "lstm_use_prev_reward": args.use_prev_reward, } + ).reporting( + min_time_s_per_iteration=0.1 ) if args.run == "PPO": @@ -92,7 +101,7 @@ def create_quadx_waypoints_env(env_config): stop = { "training_iteration": args.stop_iters, "timesteps_total": args.stop_timesteps, - "episode_reward_mean": args.stop_reward, + "episode_return_mean": args.stop_reward, } tuner = tune.Tuner( From 91e6f4fd2adc5bea30f5243f4f53dae6c4c2a32a Mon Sep 17 00:00:00 2001 From: peterghaddad Date: Tue, 14 May 2024 15:21:07 -0600 Subject: [PATCH 07/16] Address feedback --- rllib/examples/quadx_waypoints.py | 44 ++++++++++++++----------------- 1 file changed, 20 insertions(+), 24 deletions(-) diff --git a/rllib/examples/quadx_waypoints.py b/rllib/examples/quadx_waypoints.py index e0857ef2927d..0152812e1760 100644 --- a/rllib/examples/quadx_waypoints.py +++ b/rllib/examples/quadx_waypoints.py @@ -9,27 +9,19 @@ parser.add_argument( "--run", type=str, default="PPO", help="The RLlib-registered algorithm to use." 
) +parser.add_argument('--env-name', type=str, default="quadx_waypoints") parser.add_argument("--num-cpus", type=int, default=4) +parser.add_argument("--num-envs-per-worker", type=int, default=4) parser.add_argument( "--framework", choices=["tf2", "torch"], default="torch", help="The DL framework specifier.", ) -parser.add_argument("--use-prev-action", action="store_true") -parser.add_argument("--use-prev-reward", action="store_true") -parser.add_argument( - "--as-test", - action="store_true", - help="Whether this script should be run as a test: --stop-reward must " - "be achieved within --stop-timesteps AND --stop-iters.", -) + parser.add_argument( "--stop-iters", type=int, default=500, help="Number of iterations to train." ) -parser.add_argument( - "--stop-timesteps", type=int, default=1000000, help="Number of timesteps to train." -) parser.add_argument( "--stop-reward", type=float, default=90.0, help="Reward at which we stop training." ) @@ -65,34 +57,39 @@ def create_quadx_waypoints_env(env_config): ray.init() - register_env("quadx_waypoints", env_creator=create_quadx_waypoints_env) + register_env(args.env_name, env_creator=create_quadx_waypoints_env) algo_cls = get_trainable_cls(args.run) config = algo_cls.get_default_config() config.environment( - env="quadx_waypoints" + env=args.env_name ).resources( num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0")) + ).rollouts( + num_rollout_workers=args.num_cpus, + num_envs_per_worker=args.num_envs_per_worker, ).framework( args.framework ).api_stack( enable_rl_module_and_learner=True, enable_env_runner_and_connector_v2=True, - ).rl_module( - model_config_dict={ - "use_lstm": True, - "lstm_cell_size": 32, - "lstm_use_prev_action": args.use_prev_action, - "lstm_use_prev_reward": args.use_prev_reward, - } ).reporting( min_time_s_per_iteration=0.1 ) if args.run == "PPO": - config.training(num_sgd_iter=5, vf_loss_coeff=0.0001, train_batch_size=512) - config.model["vf_share_layers"] = True + config.rl_module( + model_config_dict={ + "fcnet_hiddens": [32], + "fcnet_activation": "linear", + "vf_share_layers": True, + } + ) + config.training( + sgd_minibatch_size=128, + train_batch_size=10000, + ) elif args.run == "IMPALA": config.rollouts(num_rollout_workers=2) config.resources(num_gpus=0) @@ -100,8 +97,7 @@ def create_quadx_waypoints_env(env_config): stop = { "training_iteration": args.stop_iters, - "timesteps_total": args.stop_timesteps, - "episode_return_mean": args.stop_reward, + "env_runner_results/episode_return_mean": args.stop_reward, } tuner = tune.Tuner( From 54e9c8b65450287079d0e31678e4422b83f7c01a Mon Sep 17 00:00:00 2001 From: peterghaddad Date: Tue, 14 May 2024 16:26:11 -0600 Subject: [PATCH 08/16] Clean up --- rllib/examples/quadx_waypoints.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/rllib/examples/quadx_waypoints.py b/rllib/examples/quadx_waypoints.py index 0152812e1760..bf15d89aea63 100644 --- a/rllib/examples/quadx_waypoints.py +++ b/rllib/examples/quadx_waypoints.py @@ -19,6 +19,8 @@ help="The DL framework specifier.", ) +parser.add_argument("--as-test", type=bool, default=True) + parser.add_argument( "--stop-iters", type=int, default=500, help="Number of iterations to train." 
) @@ -54,6 +56,7 @@ def create_quadx_waypoints_env(env_config): from ray.tune.registry import register_env args = parser.parse_args() + num_gpus = int(os.environ.get("RLLIB_NUM_GPUS", "0")) ray.init() @@ -65,15 +68,16 @@ def create_quadx_waypoints_env(env_config): config.environment( env=args.env_name ).resources( - num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0")) + num_learner_workers=num_gpus, + num_gpus_per_learner_worker=num_gpus, ).rollouts( num_rollout_workers=args.num_cpus, num_envs_per_worker=args.num_envs_per_worker, ).framework( args.framework ).api_stack( - enable_rl_module_and_learner=True, - enable_env_runner_and_connector_v2=True, + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, ).reporting( min_time_s_per_iteration=0.1 ) From 0bcdb2e95db0bd5ae35df38c3e4beacf88a3c1b6 Mon Sep 17 00:00:00 2001 From: peterghaddad Date: Wed, 22 May 2024 10:10:24 -0600 Subject: [PATCH 09/16] feedback --- rllib/examples/quadx_waypoints.py | 68 ++++++++++++++++--------------- 1 file changed, 36 insertions(+), 32 deletions(-) diff --git a/rllib/examples/quadx_waypoints.py b/rllib/examples/quadx_waypoints.py index bf15d89aea63..5e92a842e5ad 100644 --- a/rllib/examples/quadx_waypoints.py +++ b/rllib/examples/quadx_waypoints.py @@ -1,32 +1,42 @@ -import argparse +"""Example using the PyFlyt Gymnasium environment to train a UAV to reach waypoints. + +PyFlyt GitHub Repository: https://github.com/jjshoots/PyFlyt/tree/master/PyFlyt + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack` + + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. + +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` +""" + import os from ray.rllib.utils.test_utils import check_learning_achieved from ray.tune.registry import get_trainable_cls import gymnasium as gym +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) -parser = argparse.ArgumentParser() +parser = add_rllib_example_script_args( + default_iters=200, + default_timesteps=100000, + default_reward=90.0, +) parser.add_argument( "--run", type=str, default="PPO", help="The RLlib-registered algorithm to use." ) parser.add_argument('--env-name', type=str, default="quadx_waypoints") -parser.add_argument("--num-cpus", type=int, default=4) parser.add_argument("--num-envs-per-worker", type=int, default=4) -parser.add_argument( - "--framework", - choices=["tf2", "torch"], - default="torch", - help="The DL framework specifier.", -) - -parser.add_argument("--as-test", type=bool, default=True) - -parser.add_argument( - "--stop-iters", type=int, default=500, help="Number of iterations to train." -) -parser.add_argument( - "--stop-reward", type=float, default=90.0, help="Reward at which we stop training." 
-) class RewardWrapper(gym.RewardWrapper): def __init__(self, env): @@ -58,8 +68,6 @@ def create_quadx_waypoints_env(env_config): args = parser.parse_args() num_gpus = int(os.environ.get("RLLIB_NUM_GPUS", "0")) - ray.init() - register_env(args.env_name, env_creator=create_quadx_waypoints_env) algo_cls = get_trainable_cls(args.run) @@ -101,18 +109,14 @@ def create_quadx_waypoints_env(env_config): stop = { "training_iteration": args.stop_iters, - "env_runner_results/episode_return_mean": args.stop_reward, + "env_runners/episode_reward_mean": args.stop_reward, } - tuner = tune.Tuner( - args.run, - param_space=config.to_dict(), - run_config=air.RunConfig( - stop=stop, - ), + run_rllib_example_script_experiment( + config, + args, + stop=stop, + success_metric={ + "env_runners/episode_reward_mean": args.stop_reward, + }, ) - results = tuner.fit() - - if args.as_test: - check_learning_achieved(results, args.stop_reward) - ray.shutdown() From 2d4cc64a4d7f6c3313693332ab04125fc4a76aad Mon Sep 17 00:00:00 2001 From: peterghaddad Date: Wed, 22 May 2024 10:32:57 -0600 Subject: [PATCH 10/16] lint Signed-off-by: peterghaddad --- rllib/examples/quadx_waypoints.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/rllib/examples/quadx_waypoints.py b/rllib/examples/quadx_waypoints.py index 5e92a842e5ad..21a09be524bb 100644 --- a/rllib/examples/quadx_waypoints.py +++ b/rllib/examples/quadx_waypoints.py @@ -35,9 +35,10 @@ parser.add_argument( "--run", type=str, default="PPO", help="The RLlib-registered algorithm to use." ) -parser.add_argument('--env-name', type=str, default="quadx_waypoints") +parser.add_argument("--env-name", type=str, default="quadx_waypoints") parser.add_argument("--num-envs-per-worker", type=int, default=4) + class RewardWrapper(gym.RewardWrapper): def __init__(self, env): super().__init__(env) @@ -73,9 +74,7 @@ def create_quadx_waypoints_env(env_config): algo_cls = get_trainable_cls(args.run) config = algo_cls.get_default_config() - config.environment( - env=args.env_name - ).resources( + config.environment(env=args.env_name).resources( num_learner_workers=num_gpus, num_gpus_per_learner_worker=num_gpus, ).rollouts( @@ -97,7 +96,7 @@ def create_quadx_waypoints_env(env_config): "fcnet_activation": "linear", "vf_share_layers": True, } - ) + ) config.training( sgd_minibatch_size=128, train_batch_size=10000, From 229a044605331836f234f8b4b8d5622dfc55251e Mon Sep 17 00:00:00 2001 From: peterghaddad Date: Wed, 22 May 2024 15:00:24 -0600 Subject: [PATCH 11/16] clean up Signed-off-by: peterghaddad --- rllib/examples/quadx_waypoints.py | 35 ++++++++++++++++++------------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/rllib/examples/quadx_waypoints.py b/rllib/examples/quadx_waypoints.py index 21a09be524bb..c3c3472b54b4 100644 --- a/rllib/examples/quadx_waypoints.py +++ b/rllib/examples/quadx_waypoints.py @@ -19,25 +19,31 @@ import os -from ray.rllib.utils.test_utils import check_learning_achieved from ray.tune.registry import get_trainable_cls import gymnasium as gym from ray.rllib.utils.test_utils import ( add_rllib_example_script_args, run_rllib_example_script_experiment, ) +from ray.rllib.utils.metrics import ( + ENV_RUNNER_RESULTS, + EPISODE_RETURN_MEAN, + EVALUATION_RESULTS, + TRAINING_ITERATION_TIMER +) parser = add_rllib_example_script_args( - default_iters=200, + default_iters=2000, default_timesteps=100000, default_reward=90.0, ) parser.add_argument( "--run", type=str, default="PPO", help="The RLlib-registered algorithm to use." 
) -parser.add_argument("--env-name", type=str, default="quadx_waypoints") +parser.add_argument('--env-name', type=str, default="quadx_waypoints") parser.add_argument("--num-envs-per-worker", type=int, default=4) - +parser.add_argument("--use-prev-action", action="store_true") +parser.add_argument("--use-prev-reward", action="store_true") class RewardWrapper(gym.RewardWrapper): def __init__(self, env): @@ -62,8 +68,6 @@ def create_quadx_waypoints_env(env_config): if __name__ == "__main__": - import ray - from ray import air, tune from ray.tune.registry import register_env args = parser.parse_args() @@ -74,7 +78,9 @@ def create_quadx_waypoints_env(env_config): algo_cls = get_trainable_cls(args.run) config = algo_cls.get_default_config() - config.environment(env=args.env_name).resources( + config.environment( + env=args.env_name + ).resources( num_learner_workers=num_gpus, num_gpus_per_learner_worker=num_gpus, ).rollouts( @@ -92,11 +98,12 @@ def create_quadx_waypoints_env(env_config): if args.run == "PPO": config.rl_module( model_config_dict={ - "fcnet_hiddens": [32], - "fcnet_activation": "linear", - "vf_share_layers": True, + "use_lstm": True, + "lstm_cell_size": 32, + "lstm_use_prev_action": args.use_prev_action, + "lstm_use_prev_reward": args.use_prev_reward, } - ) + ) config.training( sgd_minibatch_size=128, train_batch_size=10000, @@ -107,8 +114,8 @@ def create_quadx_waypoints_env(env_config): config.training(vf_loss_coeff=0.01) stop = { - "training_iteration": args.stop_iters, - "env_runners/episode_reward_mean": args.stop_reward, + TRAINING_ITERATION_TIMER: args.stop_iters, + f"{EVALUATION_RESULTS}/{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": args.stop_reward, } run_rllib_example_script_experiment( @@ -116,6 +123,6 @@ def create_quadx_waypoints_env(env_config): args, stop=stop, success_metric={ - "env_runners/episode_reward_mean": args.stop_reward, + f"{EVALUATION_RESULTS}/{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": args.stop_reward, }, ) From 532eed4fb09df6058cecba6ba9d4c4724b61e7ec Mon Sep 17 00:00:00 2001 From: peterghaddad Date: Thu, 23 May 2024 07:39:25 -0600 Subject: [PATCH 12/16] Clean up --- rllib/examples/quadx_waypoints.py | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/rllib/examples/quadx_waypoints.py b/rllib/examples/quadx_waypoints.py index c3c3472b54b4..5f896c1d33aa 100644 --- a/rllib/examples/quadx_waypoints.py +++ b/rllib/examples/quadx_waypoints.py @@ -28,22 +28,20 @@ from ray.rllib.utils.metrics import ( ENV_RUNNER_RESULTS, EPISODE_RETURN_MEAN, - EVALUATION_RESULTS, - TRAINING_ITERATION_TIMER + TRAINING_ITERATION_TIMER, ) parser = add_rllib_example_script_args( - default_iters=2000, + default_iters=200, default_timesteps=100000, default_reward=90.0, ) parser.add_argument( "--run", type=str, default="PPO", help="The RLlib-registered algorithm to use." 
) -parser.add_argument('--env-name', type=str, default="quadx_waypoints") +parser.add_argument("--env-name", type=str, default="quadx_waypoints") parser.add_argument("--num-envs-per-worker", type=int, default=4) -parser.add_argument("--use-prev-action", action="store_true") -parser.add_argument("--use-prev-reward", action="store_true") + class RewardWrapper(gym.RewardWrapper): def __init__(self, env): @@ -78,9 +76,7 @@ def create_quadx_waypoints_env(env_config): algo_cls = get_trainable_cls(args.run) config = algo_cls.get_default_config() - config.environment( - env=args.env_name - ).resources( + config.environment(env=args.env_name).resources( num_learner_workers=num_gpus, num_gpus_per_learner_worker=num_gpus, ).rollouts( @@ -98,12 +94,11 @@ def create_quadx_waypoints_env(env_config): if args.run == "PPO": config.rl_module( model_config_dict={ - "use_lstm": True, - "lstm_cell_size": 32, - "lstm_use_prev_action": args.use_prev_action, - "lstm_use_prev_reward": args.use_prev_reward, + "fcnet_hiddens": [32], + "fcnet_activation": "linear", + "vf_share_layers": True, } - ) + ) config.training( sgd_minibatch_size=128, train_batch_size=10000, @@ -113,9 +108,11 @@ def create_quadx_waypoints_env(env_config): config.resources(num_gpus=0) config.training(vf_loss_coeff=0.01) + EPISODE_RETURN_MEAN_KEY = f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}" + stop = { TRAINING_ITERATION_TIMER: args.stop_iters, - f"{EVALUATION_RESULTS}/{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": args.stop_reward, + EPISODE_RETURN_MEAN_KEY: args.stop_reward, } run_rllib_example_script_experiment( @@ -123,6 +120,6 @@ def create_quadx_waypoints_env(env_config): args, stop=stop, success_metric={ - f"{EVALUATION_RESULTS}/{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": args.stop_reward, + EPISODE_RETURN_MEAN_KEY: args.stop_reward, }, ) From 6c0a7e94c0cdf733514aaa2d68755ca1e3e1248b Mon Sep 17 00:00:00 2001 From: peterghaddad Date: Tue, 28 May 2024 06:44:43 -0600 Subject: [PATCH 13/16] Clean up --- rllib/examples/quadx_waypoints.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/rllib/examples/quadx_waypoints.py b/rllib/examples/quadx_waypoints.py index 5f896c1d33aa..45143e2ef115 100644 --- a/rllib/examples/quadx_waypoints.py +++ b/rllib/examples/quadx_waypoints.py @@ -28,7 +28,7 @@ from ray.rllib.utils.metrics import ( ENV_RUNNER_RESULTS, EPISODE_RETURN_MEAN, - TRAINING_ITERATION_TIMER, + TRAINING_ITERATION ) parser = add_rllib_example_script_args( @@ -111,8 +111,8 @@ def create_quadx_waypoints_env(env_config): EPISODE_RETURN_MEAN_KEY = f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}" stop = { - TRAINING_ITERATION_TIMER: args.stop_iters, - EPISODE_RETURN_MEAN_KEY: args.stop_reward, + TRAINING_ITERATION: args.stop_iters, + f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": args.stop_reward, } run_rllib_example_script_experiment( @@ -120,6 +120,6 @@ def create_quadx_waypoints_env(env_config): args, stop=stop, success_metric={ - EPISODE_RETURN_MEAN_KEY: args.stop_reward, + f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": args.stop_reward, }, ) From 68793870841a5aef1eff568f22cbbb038758aa4e Mon Sep 17 00:00:00 2001 From: peterghaddad Date: Tue, 28 May 2024 06:45:50 -0600 Subject: [PATCH 14/16] lint Signed-off-by: peterghaddad --- rllib/examples/quadx_waypoints.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rllib/examples/quadx_waypoints.py b/rllib/examples/quadx_waypoints.py index 45143e2ef115..3d64ac6b1a88 100644 --- a/rllib/examples/quadx_waypoints.py +++ b/rllib/examples/quadx_waypoints.py @@ -28,7 
+28,7 @@ from ray.rllib.utils.metrics import ( ENV_RUNNER_RESULTS, EPISODE_RETURN_MEAN, - TRAINING_ITERATION + TRAINING_ITERATION, ) parser = add_rllib_example_script_args( From 98cd7c8ae94021dd3188b486e6d28a944acdf414 Mon Sep 17 00:00:00 2001 From: peterghaddad Date: Wed, 29 May 2024 11:09:33 -0600 Subject: [PATCH 15/16] remove training_iteration constant --- rllib/examples/quadx_waypoints.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/rllib/examples/quadx_waypoints.py b/rllib/examples/quadx_waypoints.py index 3d64ac6b1a88..fa9b9b421932 100644 --- a/rllib/examples/quadx_waypoints.py +++ b/rllib/examples/quadx_waypoints.py @@ -28,7 +28,6 @@ from ray.rllib.utils.metrics import ( ENV_RUNNER_RESULTS, EPISODE_RETURN_MEAN, - TRAINING_ITERATION, ) parser = add_rllib_example_script_args( @@ -111,7 +110,7 @@ def create_quadx_waypoints_env(env_config): EPISODE_RETURN_MEAN_KEY = f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}" stop = { - TRAINING_ITERATION: args.stop_iters, + "training_iteration": args.stop_iters, f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": args.stop_reward, } From 769c9c85c33f411ebddd9f6801c7f3675ec6fbc1 Mon Sep 17 00:00:00 2001 From: peterghaddad Date: Mon, 3 Jun 2024 05:46:47 -0600 Subject: [PATCH 16/16] clean up Signed-off-by: peterghaddad --- rllib/examples/quadx_waypoints.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/rllib/examples/quadx_waypoints.py b/rllib/examples/quadx_waypoints.py index fa9b9b421932..6148ede3a6f5 100644 --- a/rllib/examples/quadx_waypoints.py +++ b/rllib/examples/quadx_waypoints.py @@ -28,6 +28,7 @@ from ray.rllib.utils.metrics import ( ENV_RUNNER_RESULTS, EPISODE_RETURN_MEAN, + TRAINING_ITERATION_TIMER, ) parser = add_rllib_example_script_args( @@ -110,8 +111,8 @@ def create_quadx_waypoints_env(env_config): EPISODE_RETURN_MEAN_KEY = f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}" stop = { - "training_iteration": args.stop_iters, - f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": args.stop_reward, + TRAINING_ITERATION_TIMER: args.stop_iters, + EPISODE_RETURN_MEAN_KEY: args.stop_reward, } run_rllib_example_script_experiment(
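
A quick way to sanity-check the environment factory defined in the final revision of rllib/examples/quadx_waypoints.py is to roll out a single random-action episode outside of RLlib. The sketch below is illustrative only, not part of the patch series: it assumes the patches above are applied to an editable Ray checkout and that PyFlyt and gymnasium are installed; the episode loop, seed, and print statement are additions for the smoke test, not part of the example itself.

# Minimal sketch (not part of the patch series): one random rollout through the
# wrapped PyFlyt env, to verify observation shapes, reward scaling, and termination.
# Assumes `pip install PyFlyt` and that the example module above is importable.
from ray.rllib.examples.quadx_waypoints import create_quadx_waypoints_env

env = create_quadx_waypoints_env({})  # the factory ignores its env_config argument
obs, info = env.reset(seed=0)
terminated = truncated = False
episode_return = 0.0
while not (terminated or truncated):
    # Random actions are enough for a smoke test; no trained policy is involved.
    obs, reward, terminated, truncated, info = env.step(env.action_space.sample())
    episode_return += reward
print("random-policy episode return:", episode_return)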