[RLlib] Example Cleanups new API Stack - Autoregressive Action Module. (
simonsays1980 authored May 24, 2024
1 parent 5cb7c09 commit fde5203
Showing 4 changed files with 416 additions and 8 deletions.
13 changes: 7 additions & 6 deletions rllib/examples/envs/classes/correlated_actions_env.py
@@ -1,5 +1,6 @@
 import gymnasium as gym
-from gymnasium.spaces import Discrete, Tuple
+from gymnasium.spaces import Box, Discrete, Tuple
+import numpy as np
 import random


@@ -13,19 +14,19 @@ class CorrelatedActionsEnv(gym.Env):
     to a1. I.e., +10 at most per step.

     One way to effectively learn this is through correlated action
-    distributions, e.g., in examples/autoregressive_action_dist.py
+    distributions, e.g., in examples/rl_modules/autoregressive_action_rlm.py

     There are 20 steps. Hence, the best score would be ~200 reward.
     """

-    def __init__(self, _):
-        self.observation_space = Discrete(2)
+    def __init__(self, _=None):
+        self.observation_space = Box(0, 1, shape=(1,), dtype=np.float32)
         self.action_space = Tuple([Discrete(2), Discrete(2)])
         self.last_observation = None

     def reset(self, *, seed=None, options=None):
         self.t = 0
-        self.last_observation = random.choice([0, 1])
+        self.last_observation = np.array([random.choice([0, 1])], dtype=np.float32)
         return self.last_observation, {}

     def step(self, action):
@@ -39,5 +40,5 @@ def step(self, action):
         if a1 == a2:
             reward += 5
         done = truncated = self.t > 20
-        self.last_observation = random.choice([0, 1])
+        self.last_observation = np.array([random.choice([0, 1])], dtype=np.float32)
         return self.last_observation, reward, done, truncated, {}
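For intuition, here is a minimal rollout sketch (an illustration, not part of this commit) that drives the updated environment with a hand-coded policy which copies a1 into a2. Picking a1 from the observed bit is an assumption about the part of the reward logic hidden in the collapsed hunks above.

from ray.rllib.examples.envs.classes.correlated_actions_env import CorrelatedActionsEnv

env = CorrelatedActionsEnv()
obs, _ = env.reset()
total_reward, done, truncated = 0.0, False, False
while not (done or truncated):
    a1 = int(obs[0])  # Assumption: act on the observed bit.
    a2 = a1           # Synchronize the second action with the first.
    obs, reward, done, truncated, _ = env.step((a1, a2))
    total_reward += reward
print(f"Return of the hand-coded policy: {total_reward}")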
107 changes: 107 additions & 0 deletions rllib/examples/rl_modules/autoregressive_actions_rlm.py
@@ -0,0 +1,107 @@
"""An example script showing how to define and load an `RLModule` with
a dependent action space.
This examples:
- Defines an `RLModule` with autoregressive actions.
- It does so by implementing a prior distribution for the first couple
of actions and then using these actions in a posterior distribution.
- Furthermore, it uses in the `RLModule` our simple base `Catalog` class
to build the distributions.
- Uses this `RLModule` in a PPO training run on a simple environment
that rewards synchronized actions.
- Stops the training after 100k steps or when the mean episode return
exceeds 150 in evaluation, i.e. if the agent has learned to
synchronize its actions.
How to run this script
----------------------
`python [script file name].py --enable-new-api-stack --num-env-runners 2`
Control the number of `EnvRunner`s with the `--num-env-runners` flag. This
will increase the sampling speed.
For debugging, use the following additional command line options
`--no-tune --num-env-runners=0`
which should allow you to set breakpoints anywhere in the RLlib code and
have the execution stop there for inspection and debugging.
For logging to your WandB account, use:
`--wandb-key=[your WandB API key] --wandb-project=[some project name]
--wandb-run-name=[optional: WandB run name (within the defined project)]`
Results to expect
-----------------
You should expect a reward of around 155-160 after ~36,000 timesteps sampled
(trained) being achieved by a simple PPO policy (no tuning, just using RLlib's
default settings). For details take also a closer look into the
`CorrelatedActionsEnv` environment. Rewards are such that to receive a return
over 100, the agent must learn to synchronize its actions.
"""


from ray.rllib.algorithms.ppo import PPOConfig
from ray.rllib.core.models.catalog import Catalog
from ray.rllib.core.rl_module.rl_module import SingleAgentRLModuleSpec
from ray.rllib.examples.envs.classes.correlated_actions_env import CorrelatedActionsEnv
from ray.rllib.examples.rl_modules.classes.autoregressive_actions_rlm import (
    AutoregressiveActionTorchRLM,
)
from ray.rllib.utils.metrics import (
    ENV_RUNNER_RESULTS,
    EPISODE_RETURN_MEAN,
    EVALUATION_RESULTS,
    NUM_ENV_STEPS_SAMPLED_LIFETIME,
)
from ray.rllib.utils.test_utils import (
    add_rllib_example_script_args,
    run_rllib_example_script_experiment,
)
from ray.tune import register_env


register_env("correlated_actions_env", lambda _: CorrelatedActionsEnv(_))

parser = add_rllib_example_script_args(
    default_iters=200,
    default_timesteps=100000,
    default_reward=150.0,
)

if __name__ == "__main__":
    args = parser.parse_args()

    if args.algo != "PPO":
        raise ValueError("This example only supports PPO. Please use --algo=PPO.")

    base_config = (
        PPOConfig()
        .environment(env="correlated_actions_env")
        .rl_module(
            model_config_dict={
                "post_fcnet_hiddens": [64, 64],
                "post_fcnet_activation": "relu",
            },
            # We need to explicitly specify here the RLModule to use and the
            # catalog needed to build it.
            rl_module_spec=SingleAgentRLModuleSpec(
                module_class=AutoregressiveActionTorchRLM,
                catalog_class=Catalog,
            ),
        )
        .evaluation(
            evaluation_num_env_runners=1,
            evaluation_interval=1,
            # Run evaluation parallel to training to speed up the example.
            evaluation_parallel_to_training=True,
        )
    )

    # Let's stop the training after 100k steps or when the mean episode return
    # exceeds 150 in evaluation.
    stop = {
        f"{NUM_ENV_STEPS_SAMPLED_LIFETIME}": 100000,
        f"{EVALUATION_RESULTS}/{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": 150.0,
    }

    # Run the example (with Tune).
    run_rllib_example_script_experiment(base_config, args, stop=stop)
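For readers who want to see the core idea of the (not shown here) AutoregressiveActionTorchRLM in isolation, below is a conceptual, self-contained PyTorch sketch of an autoregressive action head: a prior Categorical over a1 conditioned on the observation, and a posterior Categorical over a2 conditioned on the observation plus the sampled a1. All class and variable names are hypothetical; this is not RLlib's actual module code.

import torch
import torch.nn as nn
from torch.distributions import Categorical


class AutoregressiveActionHead(nn.Module):
    """Hypothetical sketch: sample a1 ~ p(a1|obs), then a2 ~ p(a2|obs, a1)."""

    def __init__(self, obs_dim: int = 1, hidden: int = 64, num_actions: int = 2):
        super().__init__()
        self.num_actions = num_actions
        self.encoder = nn.Sequential(nn.Linear(obs_dim, hidden), nn.ReLU())
        # Prior head: logits for a1 from the encoded observation only.
        self.a1_logits = nn.Linear(hidden, num_actions)
        # Posterior head: logits for a2 from the encoding plus a one-hot a1.
        self.a2_logits = nn.Linear(hidden + num_actions, num_actions)

    def forward(self, obs: torch.Tensor):
        h = self.encoder(obs)
        a1_dist = Categorical(logits=self.a1_logits(h))
        a1 = a1_dist.sample()
        a1_one_hot = nn.functional.one_hot(a1, self.num_actions).float()
        a2_dist = Categorical(
            logits=self.a2_logits(torch.cat([h, a1_one_hot], dim=-1))
        )
        a2 = a2_dist.sample()
        # The joint log-prob factorizes as log p(a1|obs) + log p(a2|obs, a1).
        logp = a1_dist.log_prob(a1) + a2_dist.log_prob(a2)
        return (a1, a2), logp


# Usage with a batch of 4 observations from the 1-dim Box observation space.
head = AutoregressiveActionHead()
(a1, a2), logp = head(torch.rand(4, 1))

Conditioning the second head on a one-hot encoding of a1 is what lets the policy put probability mass on a2 == a1 even though a1 itself is random, which two independent action heads cannot do.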