[RLlib] Always attach latest eval metrics. (#21011)
Jun Gong authored Dec 15, 2021
1 parent 1c1430f commit 767f78e
Showing 4 changed files with 152 additions and 110 deletions.
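
For context, a minimal usage sketch of the new always_attach_evaluation_results flag introduced by this commit (not part of the diff itself; the DQN/CartPole-v0 setup mirrors the unit test below and is only illustrative):

import ray
from ray.rllib.agents import dqn

config = dqn.DEFAULT_CONFIG.copy()
config.update({
    "env": "CartPole-v0",
    # Run evaluation only every 2nd training iteration ...
    "evaluation_interval": 2,
    # ... but attach the latest available eval metrics to every result dict.
    "always_attach_evaluation_results": True,
})

ray.init()
trainer = dqn.DQNTrainer(config=config)
for i in range(4):
    result = trainer.train()
    # Once evaluation has run at least once, result["evaluation"] stays present
    # on every subsequent iteration, even ones without a fresh eval run.
    print(i, result.get("evaluation", {}).get("episode_reward_mean"))
trainer.stop()
ray.shutdown()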
102 changes: 51 additions & 51 deletions release/rllib_tests/learning_tests/hard_learning_tests.yaml
@@ -1,4 +1,3 @@

a2c-breakoutnoframeskip-v4:
env: BreakoutNoFrameskip-v4
run: A2C
@@ -163,56 +162,57 @@ appo-pongnoframeskip-v4:
# evaluation_config:
# input: sampler

# cql-halfcheetahbulletenv-v0:
# env: HalfCheetahBulletEnv-v0
# run: CQL
# pass_criteria:
# episode_reward_mean: 400.0
# timesteps_total: 10000000
# stop:
# time_total_s: 3600
# config:
# # Use input produced by expert SAC algo.
# input: ["~/halfcheetah_expert_sac.zip"]
# actions_in_input_normalized: true
#
# soft_horizon: False
# horizon: 1000
# Q_model:
# fcnet_activation: relu
# fcnet_hiddens: [256, 256, 256]
# policy_model:
# fcnet_activation: relu
# fcnet_hiddens: [256, 256, 256]
# tau: 0.005
# target_entropy: auto
# no_done_at_end: false
# n_step: 3
# rollout_fragment_length: 1
# prioritized_replay: false
# train_batch_size: 256
# target_network_update_freq: 0
# timesteps_per_iteration: 1000
# learning_starts: 256
# optimization:
# actor_learning_rate: 0.0001
# critic_learning_rate: 0.0003
# entropy_learning_rate: 0.0001
# num_workers: 0
# num_gpus: 1
# metrics_smoothing_episodes: 5
#
# # CQL Configs
# min_q_weight: 5.0
# bc_iters: 20000
# temperature: 1.0
# num_actions: 10
# lagrangian: False
#
# # Switch on online evaluation.
# evaluation_interval: 3
# evaluation_config:
# input: sampler
cql-halfcheetahbulletenv-v0:
env: HalfCheetahBulletEnv-v0
run: CQL
pass_criteria:
episode_reward_mean: 400.0
timesteps_total: 10000000
stop:
time_total_s: 3600
config:
# Use input produced by expert SAC algo.
input: ["~/halfcheetah_expert_sac.zip"]
actions_in_input_normalized: true

soft_horizon: False
horizon: 1000
Q_model:
fcnet_activation: relu
fcnet_hiddens: [256, 256, 256]
policy_model:
fcnet_activation: relu
fcnet_hiddens: [256, 256, 256]
tau: 0.005
target_entropy: auto
no_done_at_end: false
n_step: 3
rollout_fragment_length: 1
prioritized_replay: false
train_batch_size: 256
target_network_update_freq: 0
timesteps_per_iteration: 1000
learning_starts: 256
optimization:
actor_learning_rate: 0.0001
critic_learning_rate: 0.0003
entropy_learning_rate: 0.0001
num_workers: 0
num_gpus: 1
metrics_smoothing_episodes: 5

# CQL Configs
min_q_weight: 5.0
bc_iters: 20000
temperature: 1.0
num_actions: 10
lagrangian: False

# Switch on online evaluation.
evaluation_interval: 3
evaluation_config:
input: sampler
always_attach_evaluation_results: True

ddpg-hopperbulletenv-v0:
env: HopperBulletEnv-v0
98 changes: 49 additions & 49 deletions release/rllib_tests/performance_tests/performance_tests.yaml
@@ -52,55 +52,55 @@ appo-pongnoframeskip-v4:
model:
dim: 42

# Bring cql test back after we make sure it learns.
#cql-halfcheetahbulletenv-v0:
# env: HalfCheetahBulletEnv-v0
# run: CQL
# frameworks: [ "tf", "tf2", "torch" ]
# stop:
# time_total_s: 1800
# config:
# # Use input produced by expert SAC algo.
# input: ["~/halfcheetah_expert_sac.zip"]
# actions_in_input_normalized: true
#
# soft_horizon: False
# horizon: 1000
# Q_model:
# fcnet_activation: relu
# fcnet_hiddens: [256, 256, 256]
# policy_model:
# fcnet_activation: relu
# fcnet_hiddens: [256, 256, 256]
# tau: 0.005
# target_entropy: auto
# no_done_at_end: false
# n_step: 3
# rollout_fragment_length: 1
# prioritized_replay: false
# train_batch_size: 256
# target_network_update_freq: 0
# timesteps_per_iteration: 1000
# learning_starts: 256
# optimization:
# actor_learning_rate: 0.0001
# critic_learning_rate: 0.0003
# entropy_learning_rate: 0.0001
# num_workers: 0
# num_gpus: 1
# metrics_smoothing_episodes: 5
#
# # CQL Configs
# min_q_weight: 5.0
# bc_iters: 20000
# temperature: 1.0
# num_actions: 10
# lagrangian: False
#
# # Switch on online evaluation.
# evaluation_interval: 3
# evaluation_config:
# input: sampler
cql-halfcheetahbulletenv-v0:
env: HalfCheetahBulletEnv-v0
run: CQL
frameworks: [ "tf", "tf2", "torch" ]
stop:
time_total_s: 1800
config:
# Use input produced by expert SAC algo.
input: ["~/halfcheetah_expert_sac.zip"]
actions_in_input_normalized: true

soft_horizon: False
horizon: 1000
Q_model:
fcnet_activation: relu
fcnet_hiddens: [256, 256, 256]
policy_model:
fcnet_activation: relu
fcnet_hiddens: [256, 256, 256]
tau: 0.005
target_entropy: auto
no_done_at_end: false
n_step: 3
rollout_fragment_length: 1
prioritized_replay: false
train_batch_size: 256
target_network_update_freq: 0
timesteps_per_iteration: 1000
learning_starts: 256
optimization:
actor_learning_rate: 0.0001
critic_learning_rate: 0.0003
entropy_learning_rate: 0.0001
num_workers: 0
num_gpus: 1
metrics_smoothing_episodes: 5

# CQL Configs
min_q_weight: 5.0
bc_iters: 20000
temperature: 1.0
num_actions: 10
lagrangian: False

# Switch on online evaluation.
evaluation_interval: 3
evaluation_config:
input: sampler
always_attach_evaluation_results: True

sac-halfcheetahbulletenv-v0:
env: HalfCheetahBulletEnv-v0
33 changes: 33 additions & 0 deletions rllib/agents/tests/test_trainer.py
@@ -161,6 +161,39 @@ def test_evaluation_option(self):
self.assertTrue("episode_reward_mean" in r1["evaluation"])
self.assertNotEqual(r1["evaluation"], r3["evaluation"])

def test_evaluation_option_always_attach_eval_metrics(self):
config = dqn.DEFAULT_CONFIG.copy()
config.update({
"env": "CartPole-v0",
"evaluation_interval": 2,
"evaluation_duration": 2,
"evaluation_duration_unit": "episodes",
"evaluation_config": {
"gamma": 0.98,
},
"always_attach_evaluation_results": True,
# Use a custom callback that asserts that we are running the
# configured exact number of episodes per evaluation.
"callbacks": AssertEvalCallback,
})

for _ in framework_iterator(config, frameworks=("tf", "torch")):
trainer = dqn.DQNTrainer(config=config)
# Should always see latest available eval results.
r0 = trainer.train()
r1 = trainer.train()
r2 = trainer.train()
r3 = trainer.train()
trainer.stop()

# Eval results are not available at step 0.
# But step 3 should still have it, even though no eval was
# run during that step.
self.assertFalse("evaluation" in r0)
self.assertTrue("evaluation" in r1)
self.assertTrue("evaluation" in r2)
self.assertTrue("evaluation" in r3)

def test_evaluation_wo_evaluation_worker_set(self):
config = a3c.DEFAULT_CONFIG.copy()
config.update({
29 changes: 19 additions & 10 deletions rllib/agents/trainer.py
@@ -321,6 +321,11 @@
# The Trainer guarantees all eval workers have the latest policy state
# before this function is called.
"custom_eval_function": None,
# Make sure the latest available evaluation results are always attached to
# a step result dict.
# This may be useful if Tune or some other meta controller needs access
# to evaluation metrics all the time.
"always_attach_evaluation_results": False,

# === Advanced Rollout Settings ===
# Use a background thread for sampling (slightly off-policy, usually not
@@ -986,7 +991,6 @@ def auto_duration_fn(unit, num_eval_workers, eval_cfg, num_units_done):
# No parallelism.
if not self.config["evaluation_parallel_to_training"]:
step_results = next(self.train_exec_impl)

# Kick off evaluation-loop (and parallel train() call,
# if requested).
# Parallel eval + training.
@@ -997,24 +1001,25 @@ def auto_duration_fn(unit, num_eval_workers, eval_cfg, num_units_done):
# Automatically determine duration of the evaluation.
if self.config["evaluation_duration"] == "auto":
unit = self.config["evaluation_duration_unit"]

evaluation_metrics = self.evaluate(
self.evaluate(
duration_fn=functools.partial(
auto_duration_fn, unit, self.config[
"evaluation_num_workers"], self.config[
"evaluation_config"]))
else:
evaluation_metrics = self.evaluate()
self.evaluate()
# Collect the training results from the future.
step_results = train_future.result()
# Sequential: train (already done above), then eval.
else:
evaluation_metrics = self.evaluate()
self.evaluate()

# Add evaluation results to train results.
assert isinstance(evaluation_metrics, dict), \
if (evaluate_this_iter
or self.config["always_attach_evaluation_results"]):
# Attach latest available evaluation results to train results.
assert isinstance(self.evaluation_metrics, dict), \
"Trainer.evaluate() needs to return a dict."
step_results.update(evaluation_metrics)
step_results.update(self.evaluation_metrics)

# Check `env_task_fn` for possible update of the env's task.
if self.config["env_task_fn"] is not None:
Expand Down Expand Up @@ -1176,9 +1181,13 @@ def duration_fn(num_units_done):
self.evaluation_workers.remote_workers())
metrics["timesteps_this_iter"] = num_ts_run

self.evaluation_metrics = metrics
# Evaluation does not run for every step.
# Save evaluation metrics on trainer, so it can be attached to
# subsequent step results as latest evaluation result.
self.evaluation_metrics = {"evaluation": metrics}

return {"evaluation": metrics}
# Also return the results here for convenience.
return self.evaluation_metrics

@DeveloperAPI
@staticmethod
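
To summarize the trainer.py change in isolation, here is a toy sketch of the caching pattern (a hypothetical MiniTrainer, not RLlib's actual Trainer class): evaluate() stores its latest results on the trainer, and step() attaches that cached dict whenever evaluation ran this iteration or always_attach_evaluation_results is set:

class MiniTrainer:
    """Toy illustration of the caching logic; not RLlib's Trainer."""

    def __init__(self, config):
        self.config = config
        self.iteration = 0
        # Latest available eval results; empty until evaluate() runs once.
        self.evaluation_metrics = {}

    def evaluate(self):
        metrics = {"episode_reward_mean": 42.0}  # stand-in for real eval rollouts
        # Cache on the trainer so later steps can re-attach the latest results.
        self.evaluation_metrics = {"evaluation": metrics}
        return self.evaluation_metrics

    def step(self):
        self.iteration += 1
        step_results = {"training_iteration": self.iteration}
        evaluate_this_iter = (
            self.iteration % self.config["evaluation_interval"] == 0)
        if evaluate_this_iter:
            self.evaluate()
        if (evaluate_this_iter
                or self.config["always_attach_evaluation_results"]):
            step_results.update(self.evaluation_metrics)
        return step_results

With evaluation_interval=2 and the flag enabled, iterations 2, 3, 4, ... all report an "evaluation" key while iteration 1 does not, matching the expectations in test_evaluation_option_always_attach_eval_metrics above.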
