
Example containing a proposal for computing an adapted (time-dependent) GAE used by the PPO algorithm (via callback on_postprocess_trajectory) #20850

Merged: 3 commits, Dec 9, 2021
python/ray/train/examples/tensorflow_quick_start.py (2 changes: 1 addition & 1 deletion)
@@ -85,4 +85,4 @@ def train_func_distributed():
 results = trainer.run(train_func_distributed)
 trainer.shutdown()
 
-# __tf_trainer_end__
+# __tf_trainer_end__
python/ray/train/examples/torch_quick_start.py (2 changes: 1 addition & 1 deletion)
@@ -84,4 +84,4 @@ def train_func_distributed():
 results = trainer.run(train_func_distributed)
 trainer.shutdown()
 
-# __torch_trainer_end__
+# __torch_trainer_end__
rllib/examples/compute_adapted_gae_on_postprocess_trajectory.py (new file: 130 additions & 0 deletions)
@@ -0,0 +1,130 @@
"""
Adapted (time-dependent) GAE for PPO algorithm can be activated by setting
use_adapted_gae=True in the policy config. Additionally, it is required that
"callbacks" include the custom callback class in the Trainer's config.
Furthermore, the env must return in its info dictionary a key-value pair of
the form "d_ts": ... where the value is the length (time) of recent agent step.

This adapted, time-dependent computation of advantages may be useful in cases
where agent's actions take various times and thus time steps are not
equidistant (https://docdro.id/400TvlR)
"""

from ray.rllib.agents.callbacks import DefaultCallbacks
from ray.rllib.policy.sample_batch import SampleBatch
from ray.rllib.evaluation.postprocessing import Postprocessing
from ray.rllib.utils.annotations import override
import numpy as np
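

# ---------------------------------------------------------------------------
# Illustrative sketch only (an assumption, not part of the callback's
# requirements beyond the "d_ts" info key): a toy env whose step() reports,
# via "d_ts", how many integer time units the just-taken step consumed. The
# class name, dynamics, and durations are made up; any env that returns an
# integer "d_ts" in its info dict works with the callback below.
# ---------------------------------------------------------------------------
import random

import gym
from gym.spaces import Box, Discrete


class VariableDurationEnv(gym.Env):
    """Toy env in which each action takes a random integer amount of time."""

    def __init__(self, config=None):
        self.observation_space = Box(-1.0, 1.0, shape=(1, ))
        self.action_space = Discrete(2)
        self.t = 0

    def reset(self):
        self.t = 0
        return self.observation_space.sample()

    def step(self, action):
        # Pretend the chosen action took between 1 and 5 time units.
        d_ts = random.randint(1, 5)
        self.t += d_ts
        done = self.t >= 100
        # The callback below requires this "d_ts" entry in the info dict.
        return self.observation_space.sample(), 1.0, done, {"d_ts": d_ts}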


class MyCallbacks(DefaultCallbacks):
    @override(DefaultCallbacks)
    def on_postprocess_trajectory(self, *, worker, episode, agent_id,
                                  policy_id, policies, postprocessed_batch,
                                  original_batches, **kwargs):
        super().on_postprocess_trajectory(
            worker=worker,
            episode=episode,
            agent_id=agent_id,
            policy_id=policy_id,
            policies=policies,
            postprocessed_batch=postprocessed_batch,
            original_batches=original_batches,
            **kwargs)

        if policies[policy_id].config.get("use_adapted_gae", False):
            policy = policies[policy_id]
            assert policy.config["use_gae"], \
                "Can't use adapted GAE without use_gae=True!"

            info_dicts = postprocessed_batch[SampleBatch.INFOS]
            assert np.all(["d_ts" in info_dict for info_dict in info_dicts]), \
                "Info dicts in sample batch must contain data 'd_ts' " \
                "(=ts[i+1]-ts[i], the length of each time step)!"

            # Cast to (Python) float so that `is_integer()` can be used below.
            d_ts = np.array(
                [float(info_dict.get("d_ts")) for info_dict in info_dicts])
            assert np.all([e.is_integer() for e in d_ts]), \
                "Elements of 'd_ts' (lengths of time steps) must be integers!"

            # Trajectory is actually complete -> last r=0.0.
            if postprocessed_batch[SampleBatch.DONES][-1]:
                last_r = 0.0
            # Trajectory has been truncated -> last r=VF estimate of last obs.
            else:
                # Input dict is provided to us automatically via the Model's
                # requirements. It's a single-timestep (last one in trajectory)
                # input_dict.
                # Create an input dict according to the Model's requirements.
                input_dict = postprocessed_batch.get_single_step_input_dict(
                    policy.model.view_requirements, index="last")
                last_r = policy._value(**input_dict)

            gamma = policy.config["gamma"]
            lambda_ = policy.config["lambda"]

            vpred_t = np.concatenate([
                postprocessed_batch[SampleBatch.VF_PREDS],
                np.array([last_r])
            ])
            delta_t = (postprocessed_batch[SampleBatch.REWARDS] +
                       gamma**d_ts * vpred_t[1:] - vpred_t[:-1])
            # This formula for the advantage is an adaptation of
            # "Generalized Advantage Estimation"
            # (https://arxiv.org/abs/1506.02438) that accounts for time steps
            # of irregular length (see the proposal linked in the module
            # docstring above).
            # NOTE: The last time step's delta is not required.
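            # Recursively, this computes
            #   advantages[t] = delta_t[t]
            #                   + (gamma * lambda_)**d_ts[t] * advantages[t+1],
            # i.e. each successor term is discounted by the actual time that
            # elapsed during step t instead of by a single unit step.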
            postprocessed_batch[Postprocessing.ADVANTAGES] = \
                generalized_discount_cumsum(
                    delta_t, d_ts[:-1], gamma * lambda_)
            postprocessed_batch[Postprocessing.VALUE_TARGETS] = (
                postprocessed_batch[Postprocessing.ADVANTAGES] +
                postprocessed_batch[SampleBatch.VF_PREDS]).astype(np.float32)

            postprocessed_batch[Postprocessing.ADVANTAGES] = \
                postprocessed_batch[Postprocessing.ADVANTAGES].astype(
                    np.float32)


def generalized_discount_cumsum(x: np.ndarray, deltas: np.ndarray,
                                gamma: float) -> np.ndarray:
    """Calculates the 'time-dependent' discounted cumulative sum over a
    (reward) sequence `x`.

    Recursive equations (with `deltas` holding the `x.size - 1` time step
    lengths, so that discount exponents accumulate over successive steps):

        y[t] - gamma**deltas[t]*y[t+1] = x[t]

        reversed(y)[t] - gamma**reversed(deltas)[t-1]*reversed(y)[t-1] =
            reversed(x)[t]

    Args:
        x (np.ndarray): A sequence of rewards or one-step TD residuals.
        deltas (np.ndarray): A sequence of time step deltas (lengths of time
            steps), one element fewer than `x`.
        gamma (float): The discount factor gamma.

    Returns:
        np.ndarray: The sequence containing the 'time-dependent' discounted
            cumulative sums for each individual element in `x` till the end of
            the trajectory.

    Examples:
        >>> x = np.array([0.0, 1.0, 2.0, 3.0])
        >>> deltas = np.array([1.0, 4.0, 15.0])
        >>> gamma = 0.9
        >>> generalized_discount_cumsum(x, deltas, gamma)
        ... array([0.0 + 0.9^1.0*1.0 + 0.9^5.0*2.0 + 0.9^20.0*3.0,
        ...        1.0 + 0.9^4.0*2.0 + 0.9^19.0*3.0,
        ...        2.0 + 0.9^15.0*3.0,
        ...        3.0])
    """
    reversed_x = x[::-1]
    reversed_deltas = deltas[::-1]
    reversed_y = np.empty_like(x)
    reversed_y[0] = reversed_x[0]
    for i in range(1, x.size):
        reversed_y[i] = \
            reversed_x[i] + gamma**reversed_deltas[i-1] * reversed_y[i-1]

    return reversed_y[::-1]
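

# ---------------------------------------------------------------------------
# Minimal sanity check (illustrative addition, not part of the original
# example): run the time-dependent cumulative sum on the values from the
# docstring above. To use the callback during training, the Trainer's config
# would additionally need "callbacks": MyCallbacks and "use_adapted_gae": True
# (plus an env that reports "d_ts"), as described in the module docstring.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    x = np.array([0.0, 1.0, 2.0, 3.0])
    deltas = np.array([1.0, 4.0, 15.0])
    gamma = 0.9
    result = generalized_discount_cumsum(x, deltas, gamma)
    # Exponents accumulate over successive steps: 1, 1 + 4 = 5 and
    # 1 + 4 + 15 = 20 for the first element; 4 and 4 + 15 = 19 for the
    # second; 15 for the third.
    expected = np.array([
        0.0 + 0.9**1 * 1.0 + 0.9**5 * 2.0 + 0.9**20 * 3.0,
        1.0 + 0.9**4 * 2.0 + 0.9**19 * 3.0,
        2.0 + 0.9**15 * 3.0,
        3.0,
    ])
    assert np.allclose(result, expected)
    print("generalized_discount_cumsum:", result)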