From edfe921db4fd743fbb7b988670048019b20448c9 Mon Sep 17 00:00:00 2001 From: peterghaddad Date: Mon, 15 Apr 2024 20:07:32 -0600 Subject: [PATCH 01/16] initial --- .../classes/pyflyt_quadx_waypoints_env.py | 29 +++++++ rllib/examples/quadx_waypoints.py | 87 +++++++++++++++++++ 2 files changed, 116 insertions(+) create mode 100644 rllib/examples/envs/classes/pyflyt_quadx_waypoints_env.py create mode 100644 rllib/examples/quadx_waypoints.py diff --git a/rllib/examples/envs/classes/pyflyt_quadx_waypoints_env.py b/rllib/examples/envs/classes/pyflyt_quadx_waypoints_env.py new file mode 100644 index 000000000000..a7a34ffe2eab --- /dev/null +++ b/rllib/examples/envs/classes/pyflyt_quadx_waypoints_env.py @@ -0,0 +1,29 @@ +from ray.tune.registry import register_env +from gymnasium.wrappers import RecordVideo +from PyFlyt.gym_envs import FlattenWaypointEnv +from gymnasium.wrappers import TransformReward + +import gymnasium as gym +import PyFlyt.gym_envs # noqa + +class RewardWrapper(gym.RewardWrapper): + def __init__(self, env): + super().__init__(env) + def reward(self, reward): + # Scale rewards: + if reward >= 99.0 or reward <= -99.0: + return reward / 10 + return reward + +class QuadXWayPointsEnv(gym.Env): + from gymnasium.wrappers import RecordVideo + import PyFlyt.gym_envs # Must be here + from PyFlyt.gym_envs import FlattenWaypointEnv + from gymnasium.wrappers import TransformReward + + def __init__(self, config=None): + env = gym.make("PyFlyt/QuadX-Waypoints-v1") + # Wrap Environment to use max 10 and -10 for rewards + env = RewardWrapper(env) + + self.env = FlattenWaypointEnv(env, context_length=1) \ No newline at end of file diff --git a/rllib/examples/quadx_waypoints.py b/rllib/examples/quadx_waypoints.py new file mode 100644 index 000000000000..69be9348d64b --- /dev/null +++ b/rllib/examples/quadx_waypoints.py @@ -0,0 +1,87 @@ +# TODO (sven): Move this example script into the new API stack. +# TODO (sven): Move this script to `examples/rl_modules/...` + +import argparse +import os + +from ray.rllib.examples.env.pyflyt_quadx_waypoints_env import QuadXWayPointsEnv +from ray.rllib.utils.test_utils import check_learning_achieved +from ray.tune.registry import get_trainable_cls + +parser = argparse.ArgumentParser() +parser.add_argument( + "--run", type=str, default="PPO", help="The RLlib-registered algorithm to use." +) +parser.add_argument("--num-cpus", type=int, default=0) +parser.add_argument( + "--framework", + choices=["tf", "tf2", "torch"], + default="torch", + help="The DL framework specifier.", +) +parser.add_argument("--use-prev-action", action="store_true") +parser.add_argument("--use-prev-reward", action="store_true") +parser.add_argument( + "--as-test", + action="store_true", + help="Whether this script should be run as a test: --stop-reward must " + "be achieved within --stop-timesteps AND --stop-iters.", +) +parser.add_argument( + "--stop-iters", type=int, default=500, help="Number of iterations to train." +) +parser.add_argument( + "--stop-timesteps", type=int, default=1000000, help="Number of timesteps to train." +) +parser.add_argument( + "--stop-reward", type=float, default=90.0, help="Reward at which we stop training." 
+) + +if __name__ == "__main__": + import ray + from ray import air, tune + + args = parser.parse_args() + + ray.init() + + algo_cls = get_trainable_cls(args.run) + config = algo_cls.get_default_config() + + config.environment(env=QuadXWayPointsEnv).resources( + num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0")) + ).framework(args.framework).reporting(min_time_s_per_iteration=0.1).training( + model={ + "use_lstm": True, + "lstm_cell_size": 32, + "lstm_use_prev_action": args.use_prev_action, + "lstm_use_prev_reward": args.use_prev_reward, + } + ) + + if args.run == "PPO": + config.training(num_sgd_iter=5, vf_loss_coeff=0.0001, train_batch_size=512) + config.model["vf_share_layers"] = True + elif args.run == "IMPALA": + config.rollouts(num_rollout_workers=2) + config.resources(num_gpus=0) + config.training(vf_loss_coeff=0.01) + + stop = { + "training_iteration": args.stop_iters, + "timesteps_total": args.stop_timesteps, + "episode_reward_mean": args.stop_reward, + } + + tuner = tune.Tuner( + args.run, + param_space=config.to_dict(), + run_config=air.RunConfig( + stop=stop, + ), + ) + results = tuner.fit() + + if args.as_test: + check_learning_achieved(results, args.stop_reward) + ray.shutdown() From 53aa9849df2e03406ce7c3902189a5af86ec701f Mon Sep 17 00:00:00 2001 From: peterghaddad Date: Tue, 16 Apr 2024 08:54:12 -0600 Subject: [PATCH 02/16] Lint --- .../envs/classes/pyflyt_quadx_waypoints_env.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/rllib/examples/envs/classes/pyflyt_quadx_waypoints_env.py b/rllib/examples/envs/classes/pyflyt_quadx_waypoints_env.py index a7a34ffe2eab..c51b95d6e23c 100644 --- a/rllib/examples/envs/classes/pyflyt_quadx_waypoints_env.py +++ b/rllib/examples/envs/classes/pyflyt_quadx_waypoints_env.py @@ -4,26 +4,29 @@ from gymnasium.wrappers import TransformReward import gymnasium as gym -import PyFlyt.gym_envs # noqa +import PyFlyt.gym_envs # noqa + class RewardWrapper(gym.RewardWrapper): def __init__(self, env): super().__init__(env) + def reward(self, reward): # Scale rewards: if reward >= 99.0 or reward <= -99.0: return reward / 10 return reward + class QuadXWayPointsEnv(gym.Env): from gymnasium.wrappers import RecordVideo - import PyFlyt.gym_envs # Must be here + import PyFlyt.gym_envs # Must be here from PyFlyt.gym_envs import FlattenWaypointEnv from gymnasium.wrappers import TransformReward - + def __init__(self, config=None): env = gym.make("PyFlyt/QuadX-Waypoints-v1") # Wrap Environment to use max 10 and -10 for rewards env = RewardWrapper(env) - - self.env = FlattenWaypointEnv(env, context_length=1) \ No newline at end of file + + self.env = FlattenWaypointEnv(env, context_length=1) From 4c13d0535e48df9e853cb87675eb396fff5cebc7 Mon Sep 17 00:00:00 2001 From: peterghaddad Date: Tue, 16 Apr 2024 17:51:36 -0600 Subject: [PATCH 03/16] lint --- .../envs/classes/pyflyt_quadx_waypoints_env.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/rllib/examples/envs/classes/pyflyt_quadx_waypoints_env.py b/rllib/examples/envs/classes/pyflyt_quadx_waypoints_env.py index c51b95d6e23c..3bec2fc6981a 100644 --- a/rllib/examples/envs/classes/pyflyt_quadx_waypoints_env.py +++ b/rllib/examples/envs/classes/pyflyt_quadx_waypoints_env.py @@ -1,10 +1,5 @@ -from ray.tune.registry import register_env -from gymnasium.wrappers import RecordVideo from PyFlyt.gym_envs import FlattenWaypointEnv -from gymnasium.wrappers import TransformReward - import gymnasium as gym -import PyFlyt.gym_envs # noqa class 
RewardWrapper(gym.RewardWrapper): @@ -19,10 +14,7 @@ def reward(self, reward): class QuadXWayPointsEnv(gym.Env): - from gymnasium.wrappers import RecordVideo - import PyFlyt.gym_envs # Must be here - from PyFlyt.gym_envs import FlattenWaypointEnv - from gymnasium.wrappers import TransformReward + import PyFlyt.gym_envs # noqa def __init__(self, config=None): env = gym.make("PyFlyt/QuadX-Waypoints-v1") From bedb196e1cc3c7ec431249fb352f3af1a21f4437 Mon Sep 17 00:00:00 2001 From: peterghaddad Date: Wed, 17 Apr 2024 07:57:02 -0600 Subject: [PATCH 04/16] lint --- .../envs/classes/pyflyt_quadx_waypoints_env.py | 13 ++++++------- rllib/examples/quadx_waypoints.py | 14 ++++++++------ 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/rllib/examples/envs/classes/pyflyt_quadx_waypoints_env.py b/rllib/examples/envs/classes/pyflyt_quadx_waypoints_env.py index 3bec2fc6981a..f151ed3d1023 100644 --- a/rllib/examples/envs/classes/pyflyt_quadx_waypoints_env.py +++ b/rllib/examples/envs/classes/pyflyt_quadx_waypoints_env.py @@ -1,4 +1,3 @@ -from PyFlyt.gym_envs import FlattenWaypointEnv import gymnasium as gym @@ -13,12 +12,12 @@ def reward(self, reward): return reward -class QuadXWayPointsEnv(gym.Env): +def create_quadx_waypoints_env(env_config): import PyFlyt.gym_envs # noqa + from PyFlyt.gym_envs import FlattenWaypointEnv - def __init__(self, config=None): - env = gym.make("PyFlyt/QuadX-Waypoints-v1") - # Wrap Environment to use max 10 and -10 for rewards - env = RewardWrapper(env) + env = gym.make("PyFlyt/QuadX-Waypoints-v1") + # Wrap Environment to use max 10 and -10 for rewards + env = RewardWrapper(env) - self.env = FlattenWaypointEnv(env, context_length=1) + return FlattenWaypointEnv(env, context_length=1) diff --git a/rllib/examples/quadx_waypoints.py b/rllib/examples/quadx_waypoints.py index 69be9348d64b..761b00459df9 100644 --- a/rllib/examples/quadx_waypoints.py +++ b/rllib/examples/quadx_waypoints.py @@ -1,10 +1,9 @@ -# TODO (sven): Move this example script into the new API stack. -# TODO (sven): Move this script to `examples/rl_modules/...` - import argparse import os -from ray.rllib.examples.env.pyflyt_quadx_waypoints_env import QuadXWayPointsEnv +from ray.rllib.examples.envs.classes.pyflyt_quadx_waypoints_env import ( + create_quadx_waypoints_env, +) from ray.rllib.utils.test_utils import check_learning_achieved from ray.tune.registry import get_trainable_cls @@ -12,7 +11,7 @@ parser.add_argument( "--run", type=str, default="PPO", help="The RLlib-registered algorithm to use." 
) -parser.add_argument("--num-cpus", type=int, default=0) +parser.add_argument("--num-cpus", type=int, default=4) parser.add_argument( "--framework", choices=["tf", "tf2", "torch"], @@ -40,15 +39,18 @@ if __name__ == "__main__": import ray from ray import air, tune + from ray.tune.registry import register_env args = parser.parse_args() ray.init() + register_env("quadx_waypoints", create_quadx_waypoints_env) + algo_cls = get_trainable_cls(args.run) config = algo_cls.get_default_config() - config.environment(env=QuadXWayPointsEnv).resources( + config.environment(env="quadx_waypoints").resources( num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0")) ).framework(args.framework).reporting(min_time_s_per_iteration=0.1).training( model={ From 7cfaea3336c422b9357d0ad6956484c0c6288cb4 Mon Sep 17 00:00:00 2001 From: peterghaddad Date: Tue, 14 May 2024 06:38:16 -0600 Subject: [PATCH 05/16] Remove file --- .../classes/pyflyt_quadx_waypoints_env.py | 23 --------------- rllib/examples/quadx_waypoints.py | 28 ++++++++++++++++--- 2 files changed, 24 insertions(+), 27 deletions(-) delete mode 100644 rllib/examples/envs/classes/pyflyt_quadx_waypoints_env.py diff --git a/rllib/examples/envs/classes/pyflyt_quadx_waypoints_env.py b/rllib/examples/envs/classes/pyflyt_quadx_waypoints_env.py deleted file mode 100644 index f151ed3d1023..000000000000 --- a/rllib/examples/envs/classes/pyflyt_quadx_waypoints_env.py +++ /dev/null @@ -1,23 +0,0 @@ -import gymnasium as gym - - -class RewardWrapper(gym.RewardWrapper): - def __init__(self, env): - super().__init__(env) - - def reward(self, reward): - # Scale rewards: - if reward >= 99.0 or reward <= -99.0: - return reward / 10 - return reward - - -def create_quadx_waypoints_env(env_config): - import PyFlyt.gym_envs # noqa - from PyFlyt.gym_envs import FlattenWaypointEnv - - env = gym.make("PyFlyt/QuadX-Waypoints-v1") - # Wrap Environment to use max 10 and -10 for rewards - env = RewardWrapper(env) - - return FlattenWaypointEnv(env, context_length=1) diff --git a/rllib/examples/quadx_waypoints.py b/rllib/examples/quadx_waypoints.py index 761b00459df9..caafd6116edd 100644 --- a/rllib/examples/quadx_waypoints.py +++ b/rllib/examples/quadx_waypoints.py @@ -1,11 +1,9 @@ import argparse import os -from ray.rllib.examples.envs.classes.pyflyt_quadx_waypoints_env import ( - create_quadx_waypoints_env, -) from ray.rllib.utils.test_utils import check_learning_achieved from ray.tune.registry import get_trainable_cls +import gymnasium as gym parser = argparse.ArgumentParser() parser.add_argument( @@ -36,6 +34,28 @@ "--stop-reward", type=float, default=90.0, help="Reward at which we stop training." 
) +class RewardWrapper(gym.RewardWrapper): + def __init__(self, env): + super().__init__(env) + + def reward(self, reward): + # Scale rewards: + if reward >= 99.0 or reward <= -99.0: + return reward / 10 + return reward + + +def create_quadx_waypoints_env(env_config): + import PyFlyt.gym_envs # noqa + from PyFlyt.gym_envs import FlattenWaypointEnv + + env = gym.make("PyFlyt/QuadX-Waypoints-v1") + # Wrap Environment to use max 10 and -10 for rewards + env = RewardWrapper(env) + + return FlattenWaypointEnv(env, context_length=1) + + if __name__ == "__main__": import ray from ray import air, tune @@ -45,7 +65,7 @@ ray.init() - register_env("quadx_waypoints", create_quadx_waypoints_env) + register_env("quadx_waypoints", env_creator=create_quadx_waypoints_env) algo_cls = get_trainable_cls(args.run) config = algo_cls.get_default_config() From 803c129cd715193a4d55e31862341d32838d11f7 Mon Sep 17 00:00:00 2001 From: peterghaddad Date: Tue, 14 May 2024 11:44:13 -0600 Subject: [PATCH 06/16] Address feedback --- rllib/examples/quadx_waypoints.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/rllib/examples/quadx_waypoints.py b/rllib/examples/quadx_waypoints.py index caafd6116edd..e0857ef2927d 100644 --- a/rllib/examples/quadx_waypoints.py +++ b/rllib/examples/quadx_waypoints.py @@ -12,7 +12,7 @@ parser.add_argument("--num-cpus", type=int, default=4) parser.add_argument( "--framework", - choices=["tf", "tf2", "torch"], + choices=["tf2", "torch"], default="torch", help="The DL framework specifier.", ) @@ -70,15 +70,24 @@ def create_quadx_waypoints_env(env_config): algo_cls = get_trainable_cls(args.run) config = algo_cls.get_default_config() - config.environment(env="quadx_waypoints").resources( + config.environment( + env="quadx_waypoints" + ).resources( num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0")) - ).framework(args.framework).reporting(min_time_s_per_iteration=0.1).training( - model={ + ).framework( + args.framework + ).api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, + ).rl_module( + model_config_dict={ "use_lstm": True, "lstm_cell_size": 32, "lstm_use_prev_action": args.use_prev_action, "lstm_use_prev_reward": args.use_prev_reward, } + ).reporting( + min_time_s_per_iteration=0.1 ) if args.run == "PPO": @@ -92,7 +101,7 @@ def create_quadx_waypoints_env(env_config): stop = { "training_iteration": args.stop_iters, "timesteps_total": args.stop_timesteps, - "episode_reward_mean": args.stop_reward, + "episode_return_mean": args.stop_reward, } tuner = tune.Tuner( From 91e6f4fd2adc5bea30f5243f4f53dae6c4c2a32a Mon Sep 17 00:00:00 2001 From: peterghaddad Date: Tue, 14 May 2024 15:21:07 -0600 Subject: [PATCH 07/16] Address feedback --- rllib/examples/quadx_waypoints.py | 44 ++++++++++++++----------------- 1 file changed, 20 insertions(+), 24 deletions(-) diff --git a/rllib/examples/quadx_waypoints.py b/rllib/examples/quadx_waypoints.py index e0857ef2927d..0152812e1760 100644 --- a/rllib/examples/quadx_waypoints.py +++ b/rllib/examples/quadx_waypoints.py @@ -9,27 +9,19 @@ parser.add_argument( "--run", type=str, default="PPO", help="The RLlib-registered algorithm to use." 
) +parser.add_argument('--env-name', type=str, default="quadx_waypoints") parser.add_argument("--num-cpus", type=int, default=4) +parser.add_argument("--num-envs-per-worker", type=int, default=4) parser.add_argument( "--framework", choices=["tf2", "torch"], default="torch", help="The DL framework specifier.", ) -parser.add_argument("--use-prev-action", action="store_true") -parser.add_argument("--use-prev-reward", action="store_true") -parser.add_argument( - "--as-test", - action="store_true", - help="Whether this script should be run as a test: --stop-reward must " - "be achieved within --stop-timesteps AND --stop-iters.", -) + parser.add_argument( "--stop-iters", type=int, default=500, help="Number of iterations to train." ) -parser.add_argument( - "--stop-timesteps", type=int, default=1000000, help="Number of timesteps to train." -) parser.add_argument( "--stop-reward", type=float, default=90.0, help="Reward at which we stop training." ) @@ -65,34 +57,39 @@ def create_quadx_waypoints_env(env_config): ray.init() - register_env("quadx_waypoints", env_creator=create_quadx_waypoints_env) + register_env(args.env_name, env_creator=create_quadx_waypoints_env) algo_cls = get_trainable_cls(args.run) config = algo_cls.get_default_config() config.environment( - env="quadx_waypoints" + env=args.env_name ).resources( num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0")) + ).rollouts( + num_rollout_workers=args.num_cpus, + num_envs_per_worker=args.num_envs_per_worker, ).framework( args.framework ).api_stack( enable_rl_module_and_learner=True, enable_env_runner_and_connector_v2=True, - ).rl_module( - model_config_dict={ - "use_lstm": True, - "lstm_cell_size": 32, - "lstm_use_prev_action": args.use_prev_action, - "lstm_use_prev_reward": args.use_prev_reward, - } ).reporting( min_time_s_per_iteration=0.1 ) if args.run == "PPO": - config.training(num_sgd_iter=5, vf_loss_coeff=0.0001, train_batch_size=512) - config.model["vf_share_layers"] = True + config.rl_module( + model_config_dict={ + "fcnet_hiddens": [32], + "fcnet_activation": "linear", + "vf_share_layers": True, + } + ) + config.training( + sgd_minibatch_size=128, + train_batch_size=10000, + ) elif args.run == "IMPALA": config.rollouts(num_rollout_workers=2) config.resources(num_gpus=0) @@ -100,8 +97,7 @@ def create_quadx_waypoints_env(env_config): stop = { "training_iteration": args.stop_iters, - "timesteps_total": args.stop_timesteps, - "episode_return_mean": args.stop_reward, + "env_runner_results/episode_return_mean": args.stop_reward, } tuner = tune.Tuner( From 54e9c8b65450287079d0e31678e4422b83f7c01a Mon Sep 17 00:00:00 2001 From: peterghaddad Date: Tue, 14 May 2024 16:26:11 -0600 Subject: [PATCH 08/16] Clean up --- rllib/examples/quadx_waypoints.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/rllib/examples/quadx_waypoints.py b/rllib/examples/quadx_waypoints.py index 0152812e1760..bf15d89aea63 100644 --- a/rllib/examples/quadx_waypoints.py +++ b/rllib/examples/quadx_waypoints.py @@ -19,6 +19,8 @@ help="The DL framework specifier.", ) +parser.add_argument("--as-test", type=bool, default=True) + parser.add_argument( "--stop-iters", type=int, default=500, help="Number of iterations to train." 
) @@ -54,6 +56,7 @@ def create_quadx_waypoints_env(env_config): from ray.tune.registry import register_env args = parser.parse_args() + num_gpus = int(os.environ.get("RLLIB_NUM_GPUS", "0")) ray.init() @@ -65,15 +68,16 @@ def create_quadx_waypoints_env(env_config): config.environment( env=args.env_name ).resources( - num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0")) + num_learner_workers=num_gpus, + num_gpus_per_learner_worker=num_gpus, ).rollouts( num_rollout_workers=args.num_cpus, num_envs_per_worker=args.num_envs_per_worker, ).framework( args.framework ).api_stack( - enable_rl_module_and_learner=True, - enable_env_runner_and_connector_v2=True, + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, ).reporting( min_time_s_per_iteration=0.1 ) From 0bcdb2e95db0bd5ae35df38c3e4beacf88a3c1b6 Mon Sep 17 00:00:00 2001 From: peterghaddad Date: Wed, 22 May 2024 10:10:24 -0600 Subject: [PATCH 09/16] feedback --- rllib/examples/quadx_waypoints.py | 68 ++++++++++++++++--------------- 1 file changed, 36 insertions(+), 32 deletions(-) diff --git a/rllib/examples/quadx_waypoints.py b/rllib/examples/quadx_waypoints.py index bf15d89aea63..5e92a842e5ad 100644 --- a/rllib/examples/quadx_waypoints.py +++ b/rllib/examples/quadx_waypoints.py @@ -1,32 +1,42 @@ -import argparse +"""Example using the PyFlyt Gymnasium environment to train a UAV to reach waypoints. + +PyFlyt GitHub Repository: https://github.com/jjshoots/PyFlyt/tree/master/PyFlyt + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack` + + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. + +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` +""" + import os from ray.rllib.utils.test_utils import check_learning_achieved from ray.tune.registry import get_trainable_cls import gymnasium as gym +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) -parser = argparse.ArgumentParser() +parser = add_rllib_example_script_args( + default_iters=200, + default_timesteps=100000, + default_reward=90.0, +) parser.add_argument( "--run", type=str, default="PPO", help="The RLlib-registered algorithm to use." ) parser.add_argument('--env-name', type=str, default="quadx_waypoints") -parser.add_argument("--num-cpus", type=int, default=4) parser.add_argument("--num-envs-per-worker", type=int, default=4) -parser.add_argument( - "--framework", - choices=["tf2", "torch"], - default="torch", - help="The DL framework specifier.", -) - -parser.add_argument("--as-test", type=bool, default=True) - -parser.add_argument( - "--stop-iters", type=int, default=500, help="Number of iterations to train." -) -parser.add_argument( - "--stop-reward", type=float, default=90.0, help="Reward at which we stop training." 
-) class RewardWrapper(gym.RewardWrapper): def __init__(self, env): @@ -58,8 +68,6 @@ def create_quadx_waypoints_env(env_config): args = parser.parse_args() num_gpus = int(os.environ.get("RLLIB_NUM_GPUS", "0")) - ray.init() - register_env(args.env_name, env_creator=create_quadx_waypoints_env) algo_cls = get_trainable_cls(args.run) @@ -101,18 +109,14 @@ def create_quadx_waypoints_env(env_config): stop = { "training_iteration": args.stop_iters, - "env_runner_results/episode_return_mean": args.stop_reward, + "env_runners/episode_reward_mean": args.stop_reward, } - tuner = tune.Tuner( - args.run, - param_space=config.to_dict(), - run_config=air.RunConfig( - stop=stop, - ), + run_rllib_example_script_experiment( + config, + args, + stop=stop, + success_metric={ + "env_runners/episode_reward_mean": args.stop_reward, + }, ) - results = tuner.fit() - - if args.as_test: - check_learning_achieved(results, args.stop_reward) - ray.shutdown() From 2d4cc64a4d7f6c3313693332ab04125fc4a76aad Mon Sep 17 00:00:00 2001 From: peterghaddad Date: Wed, 22 May 2024 10:32:57 -0600 Subject: [PATCH 10/16] lint Signed-off-by: peterghaddad --- rllib/examples/quadx_waypoints.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/rllib/examples/quadx_waypoints.py b/rllib/examples/quadx_waypoints.py index 5e92a842e5ad..21a09be524bb 100644 --- a/rllib/examples/quadx_waypoints.py +++ b/rllib/examples/quadx_waypoints.py @@ -35,9 +35,10 @@ parser.add_argument( "--run", type=str, default="PPO", help="The RLlib-registered algorithm to use." ) -parser.add_argument('--env-name', type=str, default="quadx_waypoints") +parser.add_argument("--env-name", type=str, default="quadx_waypoints") parser.add_argument("--num-envs-per-worker", type=int, default=4) + class RewardWrapper(gym.RewardWrapper): def __init__(self, env): super().__init__(env) @@ -73,9 +74,7 @@ def create_quadx_waypoints_env(env_config): algo_cls = get_trainable_cls(args.run) config = algo_cls.get_default_config() - config.environment( - env=args.env_name - ).resources( + config.environment(env=args.env_name).resources( num_learner_workers=num_gpus, num_gpus_per_learner_worker=num_gpus, ).rollouts( @@ -97,7 +96,7 @@ def create_quadx_waypoints_env(env_config): "fcnet_activation": "linear", "vf_share_layers": True, } - ) + ) config.training( sgd_minibatch_size=128, train_batch_size=10000, From 229a044605331836f234f8b4b8d5622dfc55251e Mon Sep 17 00:00:00 2001 From: peterghaddad Date: Wed, 22 May 2024 15:00:24 -0600 Subject: [PATCH 11/16] clean up Signed-off-by: peterghaddad --- rllib/examples/quadx_waypoints.py | 35 ++++++++++++++++++------------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/rllib/examples/quadx_waypoints.py b/rllib/examples/quadx_waypoints.py index 21a09be524bb..c3c3472b54b4 100644 --- a/rllib/examples/quadx_waypoints.py +++ b/rllib/examples/quadx_waypoints.py @@ -19,25 +19,31 @@ import os -from ray.rllib.utils.test_utils import check_learning_achieved from ray.tune.registry import get_trainable_cls import gymnasium as gym from ray.rllib.utils.test_utils import ( add_rllib_example_script_args, run_rllib_example_script_experiment, ) +from ray.rllib.utils.metrics import ( + ENV_RUNNER_RESULTS, + EPISODE_RETURN_MEAN, + EVALUATION_RESULTS, + TRAINING_ITERATION_TIMER +) parser = add_rllib_example_script_args( - default_iters=200, + default_iters=2000, default_timesteps=100000, default_reward=90.0, ) parser.add_argument( "--run", type=str, default="PPO", help="The RLlib-registered algorithm to use." 
) -parser.add_argument("--env-name", type=str, default="quadx_waypoints") +parser.add_argument('--env-name', type=str, default="quadx_waypoints") parser.add_argument("--num-envs-per-worker", type=int, default=4) - +parser.add_argument("--use-prev-action", action="store_true") +parser.add_argument("--use-prev-reward", action="store_true") class RewardWrapper(gym.RewardWrapper): def __init__(self, env): @@ -62,8 +68,6 @@ def create_quadx_waypoints_env(env_config): if __name__ == "__main__": - import ray - from ray import air, tune from ray.tune.registry import register_env args = parser.parse_args() @@ -74,7 +78,9 @@ def create_quadx_waypoints_env(env_config): algo_cls = get_trainable_cls(args.run) config = algo_cls.get_default_config() - config.environment(env=args.env_name).resources( + config.environment( + env=args.env_name + ).resources( num_learner_workers=num_gpus, num_gpus_per_learner_worker=num_gpus, ).rollouts( @@ -92,11 +98,12 @@ def create_quadx_waypoints_env(env_config): if args.run == "PPO": config.rl_module( model_config_dict={ - "fcnet_hiddens": [32], - "fcnet_activation": "linear", - "vf_share_layers": True, + "use_lstm": True, + "lstm_cell_size": 32, + "lstm_use_prev_action": args.use_prev_action, + "lstm_use_prev_reward": args.use_prev_reward, } - ) + ) config.training( sgd_minibatch_size=128, train_batch_size=10000, @@ -107,8 +114,8 @@ def create_quadx_waypoints_env(env_config): config.training(vf_loss_coeff=0.01) stop = { - "training_iteration": args.stop_iters, - "env_runners/episode_reward_mean": args.stop_reward, + TRAINING_ITERATION_TIMER: args.stop_iters, + f"{EVALUATION_RESULTS}/{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": args.stop_reward, } run_rllib_example_script_experiment( @@ -116,6 +123,6 @@ def create_quadx_waypoints_env(env_config): args, stop=stop, success_metric={ - "env_runners/episode_reward_mean": args.stop_reward, + f"{EVALUATION_RESULTS}/{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": args.stop_reward, }, ) From 532eed4fb09df6058cecba6ba9d4c4724b61e7ec Mon Sep 17 00:00:00 2001 From: peterghaddad Date: Thu, 23 May 2024 07:39:25 -0600 Subject: [PATCH 12/16] Clean up --- rllib/examples/quadx_waypoints.py | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/rllib/examples/quadx_waypoints.py b/rllib/examples/quadx_waypoints.py index c3c3472b54b4..5f896c1d33aa 100644 --- a/rllib/examples/quadx_waypoints.py +++ b/rllib/examples/quadx_waypoints.py @@ -28,22 +28,20 @@ from ray.rllib.utils.metrics import ( ENV_RUNNER_RESULTS, EPISODE_RETURN_MEAN, - EVALUATION_RESULTS, - TRAINING_ITERATION_TIMER + TRAINING_ITERATION_TIMER, ) parser = add_rllib_example_script_args( - default_iters=2000, + default_iters=200, default_timesteps=100000, default_reward=90.0, ) parser.add_argument( "--run", type=str, default="PPO", help="The RLlib-registered algorithm to use." 
) -parser.add_argument('--env-name', type=str, default="quadx_waypoints") +parser.add_argument("--env-name", type=str, default="quadx_waypoints") parser.add_argument("--num-envs-per-worker", type=int, default=4) -parser.add_argument("--use-prev-action", action="store_true") -parser.add_argument("--use-prev-reward", action="store_true") + class RewardWrapper(gym.RewardWrapper): def __init__(self, env): @@ -78,9 +76,7 @@ def create_quadx_waypoints_env(env_config): algo_cls = get_trainable_cls(args.run) config = algo_cls.get_default_config() - config.environment( - env=args.env_name - ).resources( + config.environment(env=args.env_name).resources( num_learner_workers=num_gpus, num_gpus_per_learner_worker=num_gpus, ).rollouts( @@ -98,12 +94,11 @@ def create_quadx_waypoints_env(env_config): if args.run == "PPO": config.rl_module( model_config_dict={ - "use_lstm": True, - "lstm_cell_size": 32, - "lstm_use_prev_action": args.use_prev_action, - "lstm_use_prev_reward": args.use_prev_reward, + "fcnet_hiddens": [32], + "fcnet_activation": "linear", + "vf_share_layers": True, } - ) + ) config.training( sgd_minibatch_size=128, train_batch_size=10000, @@ -113,9 +108,11 @@ def create_quadx_waypoints_env(env_config): config.resources(num_gpus=0) config.training(vf_loss_coeff=0.01) + EPISODE_RETURN_MEAN_KEY = f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}" + stop = { TRAINING_ITERATION_TIMER: args.stop_iters, - f"{EVALUATION_RESULTS}/{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": args.stop_reward, + EPISODE_RETURN_MEAN_KEY: args.stop_reward, } run_rllib_example_script_experiment( @@ -123,6 +120,6 @@ def create_quadx_waypoints_env(env_config): args, stop=stop, success_metric={ - f"{EVALUATION_RESULTS}/{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": args.stop_reward, + EPISODE_RETURN_MEAN_KEY: args.stop_reward, }, ) From 6c0a7e94c0cdf733514aaa2d68755ca1e3e1248b Mon Sep 17 00:00:00 2001 From: peterghaddad Date: Tue, 28 May 2024 06:44:43 -0600 Subject: [PATCH 13/16] Clean up --- rllib/examples/quadx_waypoints.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/rllib/examples/quadx_waypoints.py b/rllib/examples/quadx_waypoints.py index 5f896c1d33aa..45143e2ef115 100644 --- a/rllib/examples/quadx_waypoints.py +++ b/rllib/examples/quadx_waypoints.py @@ -28,7 +28,7 @@ from ray.rllib.utils.metrics import ( ENV_RUNNER_RESULTS, EPISODE_RETURN_MEAN, - TRAINING_ITERATION_TIMER, + TRAINING_ITERATION ) parser = add_rllib_example_script_args( @@ -111,8 +111,8 @@ def create_quadx_waypoints_env(env_config): EPISODE_RETURN_MEAN_KEY = f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}" stop = { - TRAINING_ITERATION_TIMER: args.stop_iters, - EPISODE_RETURN_MEAN_KEY: args.stop_reward, + TRAINING_ITERATION: args.stop_iters, + f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": args.stop_reward, } run_rllib_example_script_experiment( @@ -120,6 +120,6 @@ def create_quadx_waypoints_env(env_config): args, stop=stop, success_metric={ - EPISODE_RETURN_MEAN_KEY: args.stop_reward, + f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": args.stop_reward, }, ) From 68793870841a5aef1eff568f22cbbb038758aa4e Mon Sep 17 00:00:00 2001 From: peterghaddad Date: Tue, 28 May 2024 06:45:50 -0600 Subject: [PATCH 14/16] lint Signed-off-by: peterghaddad --- rllib/examples/quadx_waypoints.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rllib/examples/quadx_waypoints.py b/rllib/examples/quadx_waypoints.py index 45143e2ef115..3d64ac6b1a88 100644 --- a/rllib/examples/quadx_waypoints.py +++ b/rllib/examples/quadx_waypoints.py @@ -28,7 
+28,7 @@ from ray.rllib.utils.metrics import ( ENV_RUNNER_RESULTS, EPISODE_RETURN_MEAN, - TRAINING_ITERATION + TRAINING_ITERATION, ) parser = add_rllib_example_script_args( From 98cd7c8ae94021dd3188b486e6d28a944acdf414 Mon Sep 17 00:00:00 2001 From: peterghaddad Date: Wed, 29 May 2024 11:09:33 -0600 Subject: [PATCH 15/16] remove training_iteration constant --- rllib/examples/quadx_waypoints.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/rllib/examples/quadx_waypoints.py b/rllib/examples/quadx_waypoints.py index 3d64ac6b1a88..fa9b9b421932 100644 --- a/rllib/examples/quadx_waypoints.py +++ b/rllib/examples/quadx_waypoints.py @@ -28,7 +28,6 @@ from ray.rllib.utils.metrics import ( ENV_RUNNER_RESULTS, EPISODE_RETURN_MEAN, - TRAINING_ITERATION, ) parser = add_rllib_example_script_args( @@ -111,7 +110,7 @@ def create_quadx_waypoints_env(env_config): EPISODE_RETURN_MEAN_KEY = f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}" stop = { - TRAINING_ITERATION: args.stop_iters, + "training_iteration": args.stop_iters, f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": args.stop_reward, } From 769c9c85c33f411ebddd9f6801c7f3675ec6fbc1 Mon Sep 17 00:00:00 2001 From: peterghaddad Date: Mon, 3 Jun 2024 05:46:47 -0600 Subject: [PATCH 16/16] clean up Signed-off-by: peterghaddad --- rllib/examples/quadx_waypoints.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/rllib/examples/quadx_waypoints.py b/rllib/examples/quadx_waypoints.py index fa9b9b421932..6148ede3a6f5 100644 --- a/rllib/examples/quadx_waypoints.py +++ b/rllib/examples/quadx_waypoints.py @@ -28,6 +28,7 @@ from ray.rllib.utils.metrics import ( ENV_RUNNER_RESULTS, EPISODE_RETURN_MEAN, + TRAINING_ITERATION_TIMER, ) parser = add_rllib_example_script_args( @@ -110,8 +111,8 @@ def create_quadx_waypoints_env(env_config): EPISODE_RETURN_MEAN_KEY = f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}" stop = { - "training_iteration": args.stop_iters, - f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": args.stop_reward, + TRAINING_ITERATION_TIMER: args.stop_iters, + EPISODE_RETURN_MEAN_KEY: args.stop_reward, } run_rllib_example_script_experiment(
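
A quick way to sanity-check the environment factory defined in the final revision of rllib/examples/quadx_waypoints.py is to roll out a single random-action episode outside of RLlib. The sketch below is illustrative only, not part of the patch series: it assumes the patches above are applied to an editable Ray checkout and that PyFlyt and gymnasium are installed; the episode loop, seed, and print statement are additions for the smoke test, not part of the example itself.

# Minimal sketch (not part of the patch series): one random rollout through the
# wrapped PyFlyt env, to verify observation shapes, reward scaling, and termination.
# Assumes `pip install PyFlyt` and that the example module above is importable.
from ray.rllib.examples.quadx_waypoints import create_quadx_waypoints_env

env = create_quadx_waypoints_env({})  # the factory ignores its env_config argument
obs, info = env.reset(seed=0)
terminated = truncated = False
episode_return = 0.0
while not (terminated or truncated):
    # Random actions are enough for a smoke test; no trained policy is involved.
    obs, reward, terminated, truncated, info = env.step(env.action_space.sample())
    episode_return += reward
print("random-policy episode return:", episode_return)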