diff --git a/release/rllib_tests/learning_tests/hard_learning_tests.yaml b/release/rllib_tests/learning_tests/hard_learning_tests.yaml index b7eeb8cb574f..f7c6a242fecd 100644 --- a/release/rllib_tests/learning_tests/hard_learning_tests.yaml +++ b/release/rllib_tests/learning_tests/hard_learning_tests.yaml @@ -1,4 +1,3 @@ - a2c-breakoutnoframeskip-v4: env: BreakoutNoFrameskip-v4 run: A2C @@ -163,56 +162,57 @@ appo-pongnoframeskip-v4: # evaluation_config: # input: sampler -# cql-halfcheetahbulletenv-v0: -# env: HalfCheetahBulletEnv-v0 -# run: CQL -# pass_criteria: -# episode_reward_mean: 400.0 -# timesteps_total: 10000000 -# stop: -# time_total_s: 3600 -# config: -# # Use input produced by expert SAC algo. -# input: ["~/halfcheetah_expert_sac.zip"] -# actions_in_input_normalized: true -# -# soft_horizon: False -# horizon: 1000 -# Q_model: -# fcnet_activation: relu -# fcnet_hiddens: [256, 256, 256] -# policy_model: -# fcnet_activation: relu -# fcnet_hiddens: [256, 256, 256] -# tau: 0.005 -# target_entropy: auto -# no_done_at_end: false -# n_step: 3 -# rollout_fragment_length: 1 -# prioritized_replay: false -# train_batch_size: 256 -# target_network_update_freq: 0 -# timesteps_per_iteration: 1000 -# learning_starts: 256 -# optimization: -# actor_learning_rate: 0.0001 -# critic_learning_rate: 0.0003 -# entropy_learning_rate: 0.0001 -# num_workers: 0 -# num_gpus: 1 -# metrics_smoothing_episodes: 5 -# -# # CQL Configs -# min_q_weight: 5.0 -# bc_iters: 20000 -# temperature: 1.0 -# num_actions: 10 -# lagrangian: False -# -# # Switch on online evaluation. -# evaluation_interval: 3 -# evaluation_config: -# input: sampler +cql-halfcheetahbulletenv-v0: + env: HalfCheetahBulletEnv-v0 + run: CQL + pass_criteria: + episode_reward_mean: 400.0 + timesteps_total: 10000000 + stop: + time_total_s: 3600 + config: + # Use input produced by expert SAC algo. + input: ["~/halfcheetah_expert_sac.zip"] + actions_in_input_normalized: true + + soft_horizon: False + horizon: 1000 + Q_model: + fcnet_activation: relu + fcnet_hiddens: [256, 256, 256] + policy_model: + fcnet_activation: relu + fcnet_hiddens: [256, 256, 256] + tau: 0.005 + target_entropy: auto + no_done_at_end: false + n_step: 3 + rollout_fragment_length: 1 + prioritized_replay: false + train_batch_size: 256 + target_network_update_freq: 0 + timesteps_per_iteration: 1000 + learning_starts: 256 + optimization: + actor_learning_rate: 0.0001 + critic_learning_rate: 0.0003 + entropy_learning_rate: 0.0001 + num_workers: 0 + num_gpus: 1 + metrics_smoothing_episodes: 5 + + # CQL Configs + min_q_weight: 5.0 + bc_iters: 20000 + temperature: 1.0 + num_actions: 10 + lagrangian: False + + # Switch on online evaluation. + evaluation_interval: 3 + evaluation_config: + input: sampler + always_attach_evaluation_results: True ddpg-hopperbulletenv-v0: env: HopperBulletEnv-v0 diff --git a/release/rllib_tests/performance_tests/performance_tests.yaml b/release/rllib_tests/performance_tests/performance_tests.yaml index fdc0aa3d045e..51de3a7181c3 100644 --- a/release/rllib_tests/performance_tests/performance_tests.yaml +++ b/release/rllib_tests/performance_tests/performance_tests.yaml @@ -52,55 +52,55 @@ appo-pongnoframeskip-v4: model: dim: 42 -# Bring cql test back after we make sure it learns. -#cql-halfcheetahbulletenv-v0: -# env: HalfCheetahBulletEnv-v0 -# run: CQL -# frameworks: [ "tf", "tf2", "torch" ] -# stop: -# time_total_s: 1800 -# config: -# # Use input produced by expert SAC algo. 
-# input: ["~/halfcheetah_expert_sac.zip"] -# actions_in_input_normalized: true -# -# soft_horizon: False -# horizon: 1000 -# Q_model: -# fcnet_activation: relu -# fcnet_hiddens: [256, 256, 256] -# policy_model: -# fcnet_activation: relu -# fcnet_hiddens: [256, 256, 256] -# tau: 0.005 -# target_entropy: auto -# no_done_at_end: false -# n_step: 3 -# rollout_fragment_length: 1 -# prioritized_replay: false -# train_batch_size: 256 -# target_network_update_freq: 0 -# timesteps_per_iteration: 1000 -# learning_starts: 256 -# optimization: -# actor_learning_rate: 0.0001 -# critic_learning_rate: 0.0003 -# entropy_learning_rate: 0.0001 -# num_workers: 0 -# num_gpus: 1 -# metrics_smoothing_episodes: 5 -# -# # CQL Configs -# min_q_weight: 5.0 -# bc_iters: 20000 -# temperature: 1.0 -# num_actions: 10 -# lagrangian: False -# -# # Switch on online evaluation. -# evaluation_interval: 3 -# evaluation_config: -# input: sampler +cql-halfcheetahbulletenv-v0: + env: HalfCheetahBulletEnv-v0 + run: CQL + frameworks: [ "tf", "tf2", "torch" ] + stop: + time_total_s: 1800 + config: + # Use input produced by expert SAC algo. + input: ["~/halfcheetah_expert_sac.zip"] + actions_in_input_normalized: true + + soft_horizon: False + horizon: 1000 + Q_model: + fcnet_activation: relu + fcnet_hiddens: [256, 256, 256] + policy_model: + fcnet_activation: relu + fcnet_hiddens: [256, 256, 256] + tau: 0.005 + target_entropy: auto + no_done_at_end: false + n_step: 3 + rollout_fragment_length: 1 + prioritized_replay: false + train_batch_size: 256 + target_network_update_freq: 0 + timesteps_per_iteration: 1000 + learning_starts: 256 + optimization: + actor_learning_rate: 0.0001 + critic_learning_rate: 0.0003 + entropy_learning_rate: 0.0001 + num_workers: 0 + num_gpus: 1 + metrics_smoothing_episodes: 5 + + # CQL Configs + min_q_weight: 5.0 + bc_iters: 20000 + temperature: 1.0 + num_actions: 10 + lagrangian: False + + # Switch on online evaluation. + evaluation_interval: 3 + evaluation_config: + input: sampler + always_attach_evaluation_results: True sac-halfcheetahbulletenv-v0: env: HalfCheetahBulletEnv-v0 diff --git a/rllib/agents/tests/test_trainer.py b/rllib/agents/tests/test_trainer.py index 479d7cae1d90..09b4314d0dc3 100644 --- a/rllib/agents/tests/test_trainer.py +++ b/rllib/agents/tests/test_trainer.py @@ -161,6 +161,39 @@ def test_evaluation_option(self): self.assertTrue("episode_reward_mean" in r1["evaluation"]) self.assertNotEqual(r1["evaluation"], r3["evaluation"]) + def test_evaluation_option_always_attach_eval_metrics(self): + config = dqn.DEFAULT_CONFIG.copy() + config.update({ + "env": "CartPole-v0", + "evaluation_interval": 2, + "evaluation_duration": 2, + "evaluation_duration_unit": "episodes", + "evaluation_config": { + "gamma": 0.98, + }, + "always_attach_evaluation_results": True, + # Use a custom callback that asserts that we are running the + # configured exact number of episodes per evaluation. + "callbacks": AssertEvalCallback, + }) + + for _ in framework_iterator(config, frameworks=("tf", "torch")): + trainer = dqn.DQNTrainer(config=config) + # Should always see latest available eval results. + r0 = trainer.train() + r1 = trainer.train() + r2 = trainer.train() + r3 = trainer.train() + trainer.stop() + + # Eval results are not available at step 0. + # But step 3 should still have it, even though no eval was + # run during that step. 
+ self.assertFalse("evaluation" in r0) + self.assertTrue("evaluation" in r1) + self.assertTrue("evaluation" in r2) + self.assertTrue("evaluation" in r3) + def test_evaluation_wo_evaluation_worker_set(self): config = a3c.DEFAULT_CONFIG.copy() config.update({ diff --git a/rllib/agents/trainer.py b/rllib/agents/trainer.py index b4b8736c61f1..9a52fbb62a3f 100644 --- a/rllib/agents/trainer.py +++ b/rllib/agents/trainer.py @@ -321,6 +321,11 @@ # The Trainer guarantees all eval workers have the latest policy state # before this function is called. "custom_eval_function": None, + # Make sure the latest available evaluation results are always attached to + # a step result dict. + # This may be useful if Tune or some other meta controller needs access + # to evaluation metrics all the time. + "always_attach_evaluation_results": False, # === Advanced Rollout Settings === # Use a background thread for sampling (slightly off-policy, usually not @@ -986,7 +991,6 @@ def auto_duration_fn(unit, num_eval_workers, eval_cfg, num_units_done): # No parallelism. if not self.config["evaluation_parallel_to_training"]: step_results = next(self.train_exec_impl) - # Kick off evaluation-loop (and parallel train() call, # if requested). # Parallel eval + training. @@ -997,24 +1001,25 @@ def auto_duration_fn(unit, num_eval_workers, eval_cfg, num_units_done): # Automatically determine duration of the evaluation. if self.config["evaluation_duration"] == "auto": unit = self.config["evaluation_duration_unit"] - - evaluation_metrics = self.evaluate( + self.evaluate( duration_fn=functools.partial( auto_duration_fn, unit, self.config[ "evaluation_num_workers"], self.config[ "evaluation_config"])) else: - evaluation_metrics = self.evaluate() + self.evaluate() # Collect the training results from the future. step_results = train_future.result() # Sequential: train (already done above), then eval. else: - evaluation_metrics = self.evaluate() + self.evaluate() - # Add evaluation results to train results. - assert isinstance(evaluation_metrics, dict), \ + if (evaluate_this_iter + or self.config["always_attach_evaluation_results"]): + # Attach latest available evaluation results to train results. + assert isinstance(self.evaluation_metrics, dict), \ "Trainer.evaluate() needs to return a dict." - step_results.update(evaluation_metrics) + step_results.update(self.evaluation_metrics) # Check `env_task_fn` for possible update of the env's task. if self.config["env_task_fn"] is not None: @@ -1176,9 +1181,13 @@ def duration_fn(num_units_done): self.evaluation_workers.remote_workers()) metrics["timesteps_this_iter"] = num_ts_run - self.evaluation_metrics = metrics + # Evaluation does not run for every step. + # Save evaluation metrics on trainer, so it can be attached to + # subsequent step results as latest evaluation result. + self.evaluation_metrics = {"evaluation": metrics} - return {"evaluation": metrics} + # Also return the results here for convenience. + return self.evaluation_metrics @DeveloperAPI @staticmethod
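A minimal usage sketch (not part of the patch) of the new "always_attach_evaluation_results" flag, mirroring the test added above in rllib/agents/tests/test_trainer.py. The environment, evaluation interval, and number of iterations below are arbitrary example values, and the behavior described in the comments is what the patched Trainer is expected to do.

    import ray
    from ray.rllib.agents import dqn

    config = dqn.DEFAULT_CONFIG.copy()
    config.update({
        "env": "CartPole-v0",
        # Run evaluation only on every 2nd training iteration ...
        "evaluation_interval": 2,
        "evaluation_duration": 2,
        "evaluation_duration_unit": "episodes",
        # ... but keep the latest available evaluation metrics attached to
        # every subsequent step result (the behavior added by this patch).
        "always_attach_evaluation_results": True,
    })

    ray.init()
    trainer = dqn.DQNTrainer(config=config)
    for i in range(4):
        result = trainer.train()
        # From the first evaluated iteration onward, "evaluation" should be
        # present in every result dict, holding the most recent eval metrics,
        # even on iterations where no evaluation actually ran.
        if "evaluation" in result:
            print(i, result["evaluation"]["episode_reward_mean"])
    trainer.stop()
    ray.shutdown()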