[RLlib] Update autoregressive actions example. #47829

Merged
Changes from 3 commits
79 changes: 53 additions & 26 deletions rllib/examples/envs/classes/correlated_actions_env.py
@@ -1,44 +1,71 @@
import gymnasium as gym
from gymnasium.spaces import Box, Discrete, Tuple
import numpy as np
-import random


-class CorrelatedActionsEnv(gym.Env):
-    """
-    Simple env in which the policy has to emit a tuple of equal actions.
+class AutoRegressiveActionEnv(gym.Env):
+    """Custom Environment with autoregressive continuous actions.
+
+    Simple env in which the policy has to emit a tuple of correlated actions.

-    In each step, the agent observes a random number (0 or 1) and has to choose
-    two actions a1 and a2.
-    It gets +5 reward for matching a1 to the random obs and +5 for matching a2
-    to a1. I.e., +10 at most per step.
+    In each step, the agent observes a random number (between -1 and 1) and has
+    to choose two actions a1 and a2.
+
+    It gets 0 reward for matching a2 to the random obs times action a1. In all
+    other cases the negative deviance between the desired action a2 and its
+    actual counterpart serves as reward. The reward is constructed in such a
+    way that actions need to be correlated to succeed. It is not possible
+    for the network to learn each action head separately.

    One way to effectively learn this is through correlated action
    distributions, e.g., in examples/rl_modules/autoregressive_action_rlm.py

-    There are 20 steps. Hence, the best score would be ~200 reward.
+    The game ends after the first step.
    """

    def __init__(self, _=None):
-        self.observation_space = Box(0, 1, shape=(1,), dtype=np.float32)
+
+        # Define the action space (two continuous actions a1, a2)
        self.action_space = Tuple([Discrete(2), Discrete(2)])
-        self.last_observation = None
-
-    def reset(self, *, seed=None, options=None):
-        self.t = 0
-        self.last_observation = np.array([random.choice([0, 1])], dtype=np.float32)
-        return self.last_observation, {}
+
+        # Define the observation space (state is a single continuous value)
+        self.observation_space = Box(low=-1, high=1, shape=(1,), dtype=np.float32)
+
+        # Internal state for the environment (e.g., could represent a factor
+        # influencing the relationship)
+        self.state = None
+
+    def reset(self, seed=None):
+        """Reset the environment to an initial state."""
+        super().reset(seed=seed)
+
+        # Randomly initialize the state between -1 and 1
+        self.state = np.random.uniform(-1, 1, size=(1,))
Review comment (Contributor), on the line above: Nice that this can be negative, too. Makes sense!
+        return self.state, {}

    def step(self, action):
-        self.t += 1
+        """Apply the autoregressive action and return step information."""
+
+        # Extract actions
        a1, a2 = action
-        reward = 0
-        # Encourage correlation between most recent observation and a1.
-        if a1 == self.last_observation:
-            reward += 5
-        # Encourage correlation between a1 and a2.
-        if a1 == a2:
-            reward += 5
-        done = truncated = self.t > 20
-        self.last_observation = np.array([random.choice([0, 1])], dtype=np.float32)
-        return self.last_observation, reward, done, truncated, {}
+
+        # The state determines the desired relationship between a1 and a2
+        desired_a2 = (
+            self.state[0] * a1
+        )  # Autoregressive relationship dependent on state
+
+        # Reward is based on how close a2 is to the state-dependent autoregressive
+        # relationship
+        reward = -np.abs(a2 - desired_a2)  # Negative absolute error as the reward
+
+        # Optionally: add some noise or complexity to the reward function
+        # reward += np.random.normal(0, 0.01)  # Small noise can be added
+
+        # Terminate after each step (no episode length in this simple example)
+        done = True
+
+        # Empty info dictionary
+        info = {}
+
+        return self.state, reward, done, False, info
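
To make the new reward contract concrete, here is a minimal smoke test for the rewritten environment. It is a sketch only: it assumes the PR's version of the module is importable under the path used by the example script below, samples one action tuple, and checks the reward against -|a2 - state * a1|.

import numpy as np

from ray.rllib.examples.envs.classes.correlated_actions_env import (
    AutoRegressiveActionEnv,
)

env = AutoRegressiveActionEnv()
obs, info = env.reset()

# Sample one (a1, a2) pair from the Tuple action space and take a single step.
a1, a2 = env.action_space.sample()
next_obs, reward, done, truncated, info = env.step((a1, a2))

# The reward is the negative absolute deviation from the state-dependent
# target: desired_a2 = state * a1, so reward = -|a2 - state * a1|.
assert np.isclose(reward, -np.abs(a2 - obs[0] * a1))
# The episode terminates after this single step.
assert done and not truncated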
10 changes: 6 additions & 4 deletions rllib/examples/rl_modules/autoregressive_actions_rl_module.py
@@ -10,7 +10,7 @@
- Uses this `RLModule` in a PPO training run on a simple environment
that rewards synchronized actions.
- Stops the training after 100k steps or when the mean episode return
-exceeds 150 in evaluation, i.e. if the agent has learned to
+exceeds 0.01 in evaluation, i.e. if the agent has learned to
synchronize its actions.

How to run this script
@@ -42,7 +42,9 @@
from ray.rllib.algorithms.ppo import PPOConfig
from ray.rllib.core.models.catalog import Catalog
from ray.rllib.core.rl_module.rl_module import RLModuleSpec
-from ray.rllib.examples.envs.classes.correlated_actions_env import CorrelatedActionsEnv
+from ray.rllib.examples.envs.classes.correlated_actions_env import (
+    AutoRegressiveActionEnv,
+)
from ray.rllib.examples.rl_modules.classes.autoregressive_actions_rlm import (
AutoregressiveActionsTorchRLM,
)
@@ -59,7 +61,7 @@
from ray.tune import register_env


-register_env("correlated_actions_env", lambda _: CorrelatedActionsEnv(_))
+register_env("correlated_actions_env", lambda _: AutoRegressiveActionEnv(_))

parser = add_rllib_example_script_args(
default_iters=200,
@@ -100,7 +102,7 @@
# exceeds 150 in evaluation.
stop = {
f"{NUM_ENV_STEPS_SAMPLED_LIFETIME}": 100000,
f"{EVALUATION_RESULTS}/{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": 150.0,
f"{EVALUATION_RESULTS}/{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": -0.012,
}

# Run the example (with Tune).

Review thread on the new evaluation-return stop threshold (-0.012):
- Contributor: nice!
- Contributor: Where does it roughly start?
- Collaborator (author): It roughly starts at around -0.55 to -0.6.
- Collaborator (author): [attached training curve: ray_tune_evaluation_env_runners_agent_episode_returns_mean_default_agent]
- Contributor: Niiice!!
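
For context, the wiring this example script performs boils down to the sketch below. It is a condensed approximation, not the full script: passing `catalog_class=Catalog` mirrors the imports shown above but is an assumption, and the real script builds its config through `add_rllib_example_script_args` and the shared example-run utilities.

from ray.rllib.algorithms.ppo import PPOConfig
from ray.rllib.core.models.catalog import Catalog
from ray.rllib.core.rl_module.rl_module import RLModuleSpec
from ray.rllib.examples.envs.classes.correlated_actions_env import (
    AutoRegressiveActionEnv,
)
from ray.rllib.examples.rl_modules.classes.autoregressive_actions_rlm import (
    AutoregressiveActionsTorchRLM,
)
from ray.tune import register_env

# Register the renamed environment under the name the config refers to.
register_env("correlated_actions_env", lambda _: AutoRegressiveActionEnv(_))

# Point PPO at the registered env and plug in the custom autoregressive RLModule.
config = (
    PPOConfig()
    .environment("correlated_actions_env")
    .rl_module(
        rl_module_spec=RLModuleSpec(
            module_class=AutoregressiveActionsTorchRLM,
            catalog_class=Catalog,  # assumption; see note above
        ),
    )
)

Because the per-step reward is a negative absolute error, episode returns start around -0.55 to -0.6 (see the review thread above) and approach 0 as the policy learns, which is why the evaluation stop threshold moves from 150 to -0.012.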
31 changes: 6 additions & 25 deletions rllib/examples/rl_modules/classes/autoregressive_actions_rlm.py
@@ -1,25 +1,26 @@
+import abc
from abc import abstractmethod
-from typing import Any, Dict
+from typing import Dict

from ray.rllib.core import Columns
from ray.rllib.core.models.base import ENCODER_OUT
from ray.rllib.core.models.configs import MLPHeadConfig
from ray.rllib.core.models.specs.specs_dict import SpecDict
+from ray.rllib.core.rl_module.apis.value_function_api import ValueFunctionAPI
from ray.rllib.core.rl_module.rl_module import RLModule
from ray.rllib.core.rl_module.torch.torch_rl_module import TorchRLModule
from ray.rllib.utils.annotations import (
override,
OverrideToImplementCustomLogic_CallToSuperRecommended,
)
from ray.rllib.utils.framework import try_import_torch
-from ray.rllib.utils.torch_utils import convert_to_torch_tensor
from ray.rllib.utils.typing import TensorType

torch, nn = try_import_torch()


# TODO (simon): Improvements: `inference-only` mode.
-class AutoregressiveActionsRLM(RLModule):
+class AutoregressiveActionsRLM(RLModule, ValueFunctionAPI, abc.ABC):
"""An RLModule that implements an autoregressive action distribution.

This RLModule implements an autoregressive action distribution, where the
@@ -124,22 +125,6 @@ def pi(self, batch: Dict[str, TensorType]) -> Dict[str, TensorType]:
A dict mapping Column names to batches of policy outputs.
"""

-    @abstractmethod
-    def _compute_values(self, batch) -> Any:
-        """Computes values using the vf-specific network(s) and given a batch of data.
-
-        Args:
-            batch: The input batch to pass through this RLModule (value function
-                encoder and vf-head).
-
-        Returns:
-            A dict mapping ModuleIDs to batches of value function outputs (already
-            squeezed on the last dimension (which should have shape (1,) b/c of the
-            single value output node). However, for complex multi-agent settings with
-            shareed value networks, the output might look differently (e.g. a single
-            return batch without the ModuleID-based mapping).
-        """


class AutoregressiveActionsTorchRLM(TorchRLModule, AutoregressiveActionsRLM):
@override(AutoregressiveActionsRLM)
@@ -261,12 +246,8 @@ def _forward_train(self, batch: Dict[str, TensorType]) -> Dict[str, TensorType]:

return outs

-    @override(AutoregressiveActionsRLM)
-    def _compute_values(self, batch, device=None):
-        infos = batch.pop(Columns.INFOS, None)
-        batch = convert_to_torch_tensor(batch, device=device)
-        if infos is not None:
-            batch[Columns.INFOS] = infos
+    @override(ValueFunctionAPI)
+    def compute_values(self, batch: Dict[str, TensorType]):

# Encoder forward pass.
encoder_outs = self.encoder(batch)[ENCODER_OUT]
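
For readers comparing against the removed `_compute_values`, the new `ValueFunctionAPI`-based override reduces to the pattern sketched below. This is an illustrative sketch, not a verbatim copy of the file: the value-head attribute name `self.vf` and the trailing squeeze are assumptions built on the visible encoder forward pass.

from typing import Dict

from ray.rllib.core.models.base import ENCODER_OUT
from ray.rllib.core.rl_module.apis.value_function_api import ValueFunctionAPI
from ray.rllib.utils.annotations import override
from ray.rllib.utils.typing import TensorType


class _ValueFunctionSketch:
    """Illustrative stand-in for the value path of AutoregressiveActionsTorchRLM."""

    @override(ValueFunctionAPI)
    def compute_values(self, batch: Dict[str, TensorType]) -> TensorType:
        # The new API stack hands in ready-to-use tensors, so the old
        # convert_to_torch_tensor()/INFOS bookkeeping is no longer needed.
        encoder_outs = self.encoder(batch)[ENCODER_OUT]

        # Value head (`self.vf` is an assumed attribute name), squeezing out the
        # trailing (1,)-shaped value dimension.
        return self.vf(encoder_outs).squeeze(-1)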