[RLlib] Cleanup examples folder 22: Add 2 (count-based) curiosity examples. #46737

Merged
20 changes: 20 additions & 0 deletions rllib/BUILD
@@ -2536,6 +2536,26 @@ py_test(
# args = ["--enable-new-api-stack", "--num-agents=2", "--as-test", "--stop-reward=-600.0", "--framework=torch", "--algo=IMPALA", "--num-env-runners=5", "--num-cpus=6"]
# )

# subdirectory: curiosity/
# ....................................
py_test(
name = "examples/curiosity/count_based_curiosity",
main = "examples/curiosity/count_based_curiosity.py",
tags = ["team:rllib", "exclusive", "examples"],
size = "medium",
srcs = ["examples/curiosity/count_based_curiosity.py"],
args = ["--enable-new-api-stack", "--as-test"]
)

py_test(
name = "examples/curiosity/euclidian_distance_based_curiosity",
main = "examples/curiosity/euclidian_distance_based_curiosity.py",
tags = ["team:rllib", "exclusive", "examples"],
size = "medium",
srcs = ["examples/curiosity/euclidian_distance_based_curiosity.py"],
args = ["--enable-new-api-stack", "--as-test"]
)

# subdirectory: curriculum/
# ....................................
py_test(
7 changes: 7 additions & 0 deletions rllib/connectors/connector_pipeline_v2.py
@@ -90,6 +90,13 @@ def __call__(
shared_data=shared_data,
**kwargs,
)
if not isinstance(data, dict):
raise ValueError(
f"`data` returned by ConnectorV2 {connector} must be a dict! "
f"You returned {data}. Check your (custom) connectors' "
f"`__call__()` method's return value and make sure you return "
f"the `data` arg passed in (either altered or unchanged)."
)
return data

def remove(self, name_or_class: Union[str, Type]):
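The new check enforces that every ConnectorV2 piece hands the batch back to the pipeline. As a minimal sketch of a conforming custom piece (the class name is hypothetical; the `__call__` signature mirrors the curiosity connectors added below in this PR):

```
from typing import Any, List, Optional

from ray.rllib.connectors.connector_v2 import ConnectorV2
from ray.rllib.core.rl_module.rl_module import RLModule
from ray.rllib.utils.typing import EpisodeType


class MyNoOpConnector(ConnectorV2):
    """Hypothetical piece illustrating the return-value contract checked above."""

    def __call__(
        self,
        *,
        rl_module: RLModule,
        data: Any,
        episodes: List[EpisodeType],
        explore: Optional[bool] = None,
        shared_data: Optional[dict] = None,
        **kwargs,
    ) -> Any:
        # Read from or mutate `data` / `episodes` here.
        ...
        # Always return the `data` dict (altered or unchanged); returning
        # anything else now raises the ValueError added in this PR.
        return data
```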
Empty file.
92 changes: 92 additions & 0 deletions rllib/examples/connectors/classes/count_based_curiosity.py
@@ -0,0 +1,92 @@
from collections import Counter
from typing import Any, List, Optional

import gymnasium as gym

from ray.rllib.connectors.connector_v2 import ConnectorV2
from ray.rllib.core.rl_module.rl_module import RLModule
from ray.rllib.utils.typing import EpisodeType


class CountBasedCuriosity(ConnectorV2):
Collaborator review comment: Nice! Great example!

"""Learner ConnectorV2 piece to compute intrinsic rewards based on obs counts.

Add this connector piece to your Learner pipeline, through your algo config:
```
config.training(
learner_connector=lambda obs_sp, act_sp: CountBasedCuriosity()
)
```

Intrinsic rewards are computed on the Learner side based on naive observation
counts, which is why this connector should only be used for simple environments
with a reasonable number of possible observations. The intrinsic reward for a given
timestep is:
r(i) = intrinsic_reward_coeff * (1 / C(obs(i)))
where C is the total (lifetime) count of the obs at timestep i.

The intrinsic reward is added to the extrinsic reward and saved back into the
episode (under the main "rewards" key).

Note that the computation and saving back to the episode all happens before the
actual train batch is generated from the episode data. Thus, the Learner and the
RLModule used do not take notice of the extra reward added.

If you would like to use a more sophisticated mechanism for intrinsic reward
computations, take a look at the `EuclidianDistanceBasedCuriosity` connector piece
at `ray.rllib.examples.connectors.classes.euclidian_distance_based_curiosity`
"""

def __init__(
self,
input_observation_space: Optional[gym.Space] = None,
input_action_space: Optional[gym.Space] = None,
*,
intrinsic_reward_coeff: float = 1.0,
**kwargs,
):
"""Initializes a CountBasedCuriosity instance.

Args:
intrinsic_reward_coeff: The weight with which to multiply the intrinsic
reward before adding (and saving) it back to the main (extrinsic)
reward of the episode at each timestep.
"""
super().__init__(input_observation_space, input_action_space)

# Naive observation counter.
self._counts = Counter()
self.intrinsic_reward_coeff = intrinsic_reward_coeff

def __call__(
self,
*,
rl_module: RLModule,
data: Any,
episodes: List[EpisodeType],
explore: Optional[bool] = None,
shared_data: Optional[dict] = None,
**kwargs,
) -> Any:
# Loop through all episodes and change the reward to
# [reward + intrinsic reward]
for sa_episode in self.single_agent_episode_iterator(
episodes=episodes, agents_that_stepped_only=False
):
# Loop through all obs, except the last one.
observations = sa_episode.get_observations(slice(None, -1))
# Get all respective (extrinsic) rewards.
rewards = sa_episode.get_rewards()

for i, (obs, rew) in enumerate(zip(observations, rewards)):
obs = tuple(obs)
# Add 1 to obs counter.
self._counts[obs] += 1
# Compute our count-based intrinsic reward and add it to the main
# (extrinsic) reward.
rew += self.intrinsic_reward_coeff * (1 / self._counts[obs])
# Store the new reward back to the episode (under the correct
# timestep/index).
sa_episode.set_rewards(new_data=rew, at_indices=i)

return data
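As a quick, standalone illustration of the count-based formula in the docstring above (made-up observation values, no RLlib involved), the bonus for a repeatedly visited observation decays as 1/count:

```
from collections import Counter

# r(i) = intrinsic_reward_coeff * (1 / C(obs(i))), as in the docstring above.
intrinsic_reward_coeff = 1.0
counts = Counter()

obs = (2, 3)  # made-up, hashable observation (e.g. a grid position)
for _ in range(4):
    counts[obs] += 1
    print(intrinsic_reward_coeff * (1.0 / counts[obs]))  # 1.0, 0.5, 0.33..., 0.25
```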
122 changes: 122 additions & 0 deletions rllib/examples/connectors/classes/euclidian_distance_based_curiosity.py
@@ -0,0 +1,122 @@
from collections import deque
from typing import Any, List, Optional

import gymnasium as gym
import numpy as np

from ray.rllib.connectors.connector_v2 import ConnectorV2
from ray.rllib.core.rl_module.rl_module import RLModule
from ray.rllib.utils.typing import EpisodeType


class EuclidianDistanceBasedCuriosity(ConnectorV2):
"""Learner ConnectorV2 piece computing intrinsic rewards with euclidian distance.

Add this connector piece to your Learner pipeline, through your algo config:
```
config.training(
learner_connector=lambda obs_sp, act_sp: EuclidianDistanceBasedCuriosity()
)
```

Intrinsic rewards are computed on the Learner side by comparing the Euclidean
distance of each incoming observation to observations already seen. A configurable
number of observations is stored in a FIFO buffer, and every incoming observation
has its distance measured against all of them.

The minimum distance measured serves as the intrinsic reward for the incoming obs
(multiplied by a fixed coefficient and added to the "main" extrinsic reward):
r(i) = intrinsic_reward_coeff * min(ED(o, o(i)) for o in stored_obs)
where `ED` is the Euclidean distance and `stored_obs` is the buffer.

The intrinsic reward is then added to the extrinsic reward and saved back into the
episode (under the main "rewards" key).

Note that the computation and saving back to the episode all happens before the
actual train batch is generated from the episode data. Thus, the Learner and the
RLModule used do not take notice of the extra reward added.

Only one observation per incoming episode is added to the buffer: the one with the
largest `min(ED)` value over all already stored observations, i.e. the episode's
most novel observation.

If you would like to use a simpler, count-based mechanism for intrinsic reward
computations, take a look at the `CountBasedCuriosity` connector piece
at `ray.rllib.examples.connectors.classes.count_based_curiosity`
"""

def __init__(
self,
input_observation_space: Optional[gym.Space] = None,
input_action_space: Optional[gym.Space] = None,
*,
intrinsic_reward_coeff: float = 1.0,
max_buffer_size: int = 100,
**kwargs,
):
"""Initializes a CountBasedCuriosity instance.

Args:
intrinsic_reward_coeff: The weight with which to multiply the intrinsic
reward before adding (and saving) it back to the main (extrinsic)
reward of the episode at each timestep.
"""
super().__init__(input_observation_space, input_action_space)

# FIFO buffer of already seen observations (oldest entries get evicted first).
self.obs_buffer = deque(maxlen=max_buffer_size)
self.intrinsic_reward_coeff = intrinsic_reward_coeff

# Simple call counter; used in `__call__` to stop adding intrinsic rewards
# after a fixed number of Learner passes.
self._test = 0

def __call__(
self,
*,
rl_module: RLModule,
data: Any,
episodes: List[EpisodeType],
explore: Optional[bool] = None,
shared_data: Optional[dict] = None,
**kwargs,
) -> Any:
if self._test > 10:
return data
self._test += 1
# Loop through all episodes and change the reward to
# [reward + intrinsic reward]
for sa_episode in self.single_agent_episode_iterator(
episodes=episodes, agents_that_stepped_only=False
):
# Loop through all obs, except the last one.
observations = sa_episode.get_observations(slice(None, -1))
# Get all respective (extrinsic) rewards.
rewards = sa_episode.get_rewards()

max_dist_obs = None
max_dist = float("-inf")
for i, (obs, rew) in enumerate(zip(observations, rewards)):
# Compare obs to all stored observations and compute the Euclidean distance.
min_dist = 0.0
if self.obs_buffer:
min_dist = min(
np.sqrt(np.sum((obs - stored_obs) ** 2))
for stored_obs in self.obs_buffer
)
if min_dist > max_dist:
max_dist = min_dist
max_dist_obs = obs

# Compute our Euclidean distance-based intrinsic reward and add it to
# the main (extrinsic) reward.
rew += self.intrinsic_reward_coeff * min_dist
# Store the new reward back to the episode (under the correct
# timestep/index).
sa_episode.set_rewards(new_data=rew, at_indices=i)

# Add this episode's observation with the largest min Euclidean distance
# to all already stored obs to the buffer (possibly evicting the oldest
# obs in there).
if max_dist_obs is not None:
self.obs_buffer.append(max_dist_obs)

return data
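Beyond the diff itself, a hedged usage sketch: the piece plugs into a new-API-stack algorithm config through the `learner_connector` argument shown in the docstring. The environment, coefficient, and buffer size below are illustrative assumptions, not values taken from this PR:

```
from ray.rllib.algorithms.ppo import PPOConfig
from ray.rllib.examples.connectors.classes.euclidian_distance_based_curiosity import (
    EuclidianDistanceBasedCuriosity,
)

config = (
    PPOConfig()
    # The connector runs in the Learner pipeline, so the new API stack is needed
    # (the example BUILD targets pass --enable-new-api-stack for the same reason).
    .api_stack(
        enable_rl_module_and_learner=True,
        enable_env_runner_and_connector_v2=True,
    )
    # Illustrative env choice: continuous observations, exploration-heavy task.
    .environment("MountainCar-v0")
    .training(
        learner_connector=lambda obs_sp, act_sp: EuclidianDistanceBasedCuriosity(
            intrinsic_reward_coeff=0.1,  # assumed value, not from this PR
            max_buffer_size=100,
        ),
    )
)
algo = config.build()
print(algo.train())
```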
14 changes: 14 additions & 0 deletions rllib/examples/connectors/count_based_curiosity.py
@@ -0,0 +1,14 @@
"""Placeholder for training with count-based curiosity.

The actual script can be found at a different location (see code below).
"""

if __name__ == "__main__":
import os
import subprocess
import sys

# Forward to "python ../curiosity/[same script name].py [same options]".
script = os.path.join(
os.path.dirname(os.path.abspath(__file__)),
"..",
"curiosity",
os.path.basename(sys.argv[0]),
)
command = [sys.executable, script] + sys.argv[1:]

# Run the script, streaming its output through.
subprocess.run(command)
14 changes: 14 additions & 0 deletions rllib/examples/connectors/euclidian_distance_based_curiosity.py
@@ -0,0 +1,14 @@
"""Placeholder for training with euclidian distance-based curiosity.

The actual script can be found at a different location (see code below).
"""

if __name__ == "__main__":
import os
import subprocess
import sys

# Forward to "python ../curiosity/[same script name].py [same options]".
script = os.path.join(
os.path.dirname(os.path.abspath(__file__)),
"..",
"curiosity",
os.path.basename(sys.argv[0]),
)
command = [sys.executable, script] + sys.argv[1:]

# Run the script, streaming its output through.
subprocess.run(command)
Empty file.