[RLlib] Provide more constants for common result dict keys, e.g. EPISODE_RETURN_MEAN. #45330

Merged

Changes from 3 commits
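This PR replaces hard-coded result-dict strings such as "episode_reward_mean" with importable constants from ray.rllib.utils.metrics. A minimal usage sketch (not taken verbatim from the diff, assuming the nested result layout shown in the updated examples below):

from ray.rllib.algorithms.ppo import PPOConfig
from ray.rllib.utils.metrics import ENV_RUNNER_RESULTS, EPISODE_RETURN_MEAN

# Build a small PPO Algorithm and read the mean episode return through the
# new constants instead of the old "episode_reward_mean" string.
algo = PPOConfig().environment("CartPole-v1").build()
result = algo.train()
print(result[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN])
algo.stop()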
2 changes: 1 addition & 1 deletion rllib/README.rst
@@ -173,7 +173,7 @@ Quick First Experiment
# we can expect to reach an optimal episode reward of 0.0.
for i in range(1):
results = algo.train()
print(f"Iter: {i}; avg. reward={results['episode_reward_mean']}")
print(f"Iter: {i}; avg. return={results['env_runner_results/episode_return_mean']}")

.. testoutput::
:options: +MOCK
21 changes: 12 additions & 9 deletions rllib/algorithms/algorithm.py
@@ -92,6 +92,9 @@
ALL_MODULES,
ENV_RUNNER_RESULTS,
ENV_RUNNER_SAMPLING_TIMER,
EPISODE_RETURN_MAX,
EPISODE_RETURN_MEAN,
EPISODE_RETURN_MIN,
EVALUATION_ITERATION_TIMER,
EVALUATION_RESULTS,
FAULT_TOLERANCE_STATS,
@@ -540,13 +543,13 @@ def default_logger_creator(config):
self.evaluation_metrics = {
# TODO: Don't dump sampler results into top-level.
"evaluation": {
"episode_reward_max": np.nan,
"episode_reward_min": np.nan,
"episode_reward_mean": np.nan,
"sampler_results": {
"episode_reward_max": np.nan,
"episode_reward_min": np.nan,
"episode_reward_mean": np.nan,
EPISODE_RETURN_MAX: np.nan,
Collaborator comment on the line above: Nice!
EPISODE_RETURN_MIN: np.nan,
EPISODE_RETURN_MEAN: np.nan,
ENV_RUNNER_RESULTS: {
EPISODE_RETURN_MAX: np.nan,
EPISODE_RETURN_MIN: np.nan,
EPISODE_RETURN_MEAN: np.nan,
},
},
}
@@ -3339,7 +3342,7 @@ def _run_one_training_iteration_and_evaluation_in_parallel_wo_thread(

# Warn if results are empty, it could be that this is because the eval timesteps
# are not enough to run through one full episode.
if eval_results["sampler_results"][NUM_EPISODES] == 0:
if eval_results[ENV_RUNNER_RESULTS][NUM_EPISODES] == 0:
logger.warning(
"This evaluation iteration resulted in an empty set of episode summary "
"results! It's possible that your configured duration timesteps are not"
@@ -3481,7 +3484,7 @@ def _compile_iteration_results_old_and_hybrid_api_stacks(
self.config.keep_per_episode_custom_metrics,
)
# TODO: Don't dump sampler results into top-level.
results.update(results["sampler_results"])
results.update(results[ENV_RUNNER_RESULTS])

results["num_healthy_workers"] = self.workers.num_healthy_remote_workers()
results["num_in_flight_async_reqs"] = self.workers.num_in_flight_async_reqs()
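The NUM_EPISODES == 0 warning in the hunk above fires when the configured evaluation duration is too short to complete a single episode. A hedged config sketch that avoids this by counting the evaluation duration in episodes rather than timesteps (parameter names as accepted by AlgorithmConfig.evaluation()):

from ray.rllib.algorithms.ppo import PPOConfig

# Run two full episodes per evaluation, so episode summary metrics such as
# EPISODE_RETURN_MEAN are never empty.
config = (
    PPOConfig()
    .environment("CartPole-v1")
    .evaluation(
        evaluation_interval=1,
        evaluation_duration=2,
        evaluation_duration_unit="episodes",
    )
)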
2 changes: 1 addition & 1 deletion rllib/algorithms/algorithm_config.py
@@ -2141,7 +2141,7 @@ def evaluation(
it wastes no extra time for evaluation - causes the evaluation results
to lag one iteration behind the rest of the training results. This is
important when picking a good checkpoint. For example, if iteration 42
reports a good evaluation `episode_reward_mean`, be aware that these
reports a good evaluation `episode_return_mean`, be aware that these
results were achieved on the weights trained in iteration 41, so you
should probably pick the iteration 41 checkpoint instead.
evaluation_force_reset_envs_before_iteration: Whether all environments
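To make the checkpoint-lag note in the evaluation() docstring above concrete, here is a rough sketch (assuming algo is an already built Algorithm with parallel evaluation enabled and that algo.save() returns a checkpoint handle): an evaluation return reported at iteration i was produced by the weights of iteration i - 1, so the previous checkpoint is the one worth keeping.

from ray.rllib.utils.metrics import ENV_RUNNER_RESULTS, EPISODE_RETURN_MEAN

best_return = float("-inf")
best_checkpoint = None
prev_checkpoint = None
for i in range(20):
    result = algo.train()
    checkpoint = algo.save()
    eval_return = (
        result.get("evaluation", {})
        .get(ENV_RUNNER_RESULTS, {})
        .get(EPISODE_RETURN_MEAN, float("-inf"))
    )
    # The eval result at iteration i reflects the weights of iteration i - 1.
    if eval_return > best_return and prev_checkpoint is not None:
        best_return, best_checkpoint = eval_return, prev_checkpoint
    prev_checkpoint = checkpoint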
16 changes: 14 additions & 2 deletions rllib/algorithms/bc/tests/test_bc.py
@@ -4,6 +4,10 @@

import ray
import ray.rllib.algorithms.bc as bc
from ray.rllib.utils.metrics import (
ENV_RUNNER_RESULTS,
EPISODE_RETURN_MEAN,
)
from ray.rllib.utils.test_utils import (
check_compute_single_action,
check_train_results,
@@ -77,11 +81,19 @@ def test_bc_compilation_and_learning_from_offline_file(self):
if eval_results:
print(
"iter={} R={}".format(
i, eval_results["episode_reward_mean"]
i,
eval_results[
f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}"
],
)
)
# Learn until good reward is reached in the actual env.
if eval_results["episode_reward_mean"] > min_reward:
if (
eval_results[
f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}"
]
> min_reward
):
print("learnt!")
learnt = True
break
7 changes: 6 additions & 1 deletion rllib/algorithms/cql/tests/test_cql.py
@@ -6,6 +6,10 @@
import ray
from ray.rllib.algorithms import cql
from ray.rllib.utils.framework import try_import_tf, try_import_torch
from ray.rllib.utils.metrics import (
ENV_RUNNER_RESULTS,
EPISODE_RETURN_MEAN,
)
from ray.rllib.utils.test_utils import (
check_compute_single_action,
check_train_results,
@@ -80,7 +84,8 @@ def test_cql_compilation(self):
print(results)
eval_results = results["evaluation"]
print(
f"iter={algo.iteration} " f"R={eval_results['episode_reward_mean']}"
f"iter={algo.iteration} "
f"R={eval_results[ENV_RUNNER_RESULTS + '/' + EPISODE_RETURN_MEAN]}"
)
check_compute_single_action(algo)

13 changes: 11 additions & 2 deletions rllib/algorithms/marwil/tests/test_marwil.py
@@ -10,6 +10,10 @@
from ray.rllib.evaluation.postprocessing import compute_advantages
from ray.rllib.offline import JsonReader
from ray.rllib.utils.framework import try_import_tf, try_import_torch
from ray.rllib.utils.metrics import (
ENV_RUNNER_RESULTS,
EPISODE_RETURN_MEAN,
)
from ray.rllib.utils.test_utils import (
check,
check_compute_single_action,
@@ -75,10 +79,15 @@ def test_marwil_compilation_and_learning_from_offline_file(self):
eval_results = results.get("evaluation")
if eval_results:
print(
"iter={} R={} ".format(i, eval_results["episode_reward_mean"])
"iter={} R={} ".format(
i, eval_results[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN]
)
)
# Learn until some reward is reached on an actual live env.
if eval_results["episode_reward_mean"] > min_reward:
if (
eval_results[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN]
> min_reward
):
print("learnt!")
learnt = True
break
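The offline-RL tests above address the same metric in two ways: test_bc.py and test_cql.py index a flattened "section/key" string, while test_marwil.py indexes the nested dicts. A small hypothetical helper (not part of the diff) that tolerates either layout:

def get_metric(results, section, key, default=None):
    # Accept both a flattened "section/key" entry and a nested dict layout.
    flat_key = f"{section}/{key}"
    if flat_key in results:
        return results[flat_key]
    return results.get(section, {}).get(key, default)

# Example: get_metric(eval_results, ENV_RUNNER_RESULTS, EPISODE_RETURN_MEAN)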
12 changes: 10 additions & 2 deletions rllib/algorithms/tests/test_algorithm.py
@@ -14,6 +14,10 @@
from ray.rllib.examples.evaluation.evaluation_parallel_to_training import (
AssertEvalCallback,
)
from ray.rllib.utils.metrics import (
ENV_RUNNER_RESULTS,
EPISODE_RETURN_MEAN,
)
from ray.rllib.utils.metrics.learner_info import LEARNER_INFO
from ray.rllib.utils.test_utils import check, framework_iterator

@@ -268,7 +272,8 @@ def test_evaluation_option(self):
self.assertTrue("evaluation" in r1)
self.assertFalse("evaluation" in r2)
self.assertTrue("evaluation" in r3)
self.assertTrue("episode_reward_mean" in r1["evaluation"])
self.assertTrue(ENV_RUNNER_RESULTS in r1["evaluation"])
self.assertTrue(EPISODE_RETURN_MEAN in r1["evaluation"][ENV_RUNNER_RESULTS])
self.assertNotEqual(r1["evaluation"], r3["evaluation"])

def test_evaluation_option_always_attach_eval_metrics(self):
@@ -331,7 +336,10 @@ def test_evaluation_wo_evaluation_env_runner_group(self):
config.create_env_on_local_worker = True
algo_w_env_on_local_worker = config.build()
results = algo_w_env_on_local_worker.evaluate()
assert "episode_reward_mean" in results
assert (
ENV_RUNNER_RESULTS in results
and EPISODE_RETURN_MEAN in results[ENV_RUNNER_RESULTS]
)
algo_w_env_on_local_worker.stop()
config.create_env_on_local_worker = False

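The assertion above also documents the shape of Algorithm.evaluate()'s return value under the new keys. A short sketch reading the evaluation return directly (assuming algo was built with evaluation env runners, or with create_env_on_local_worker=True as in the test):

from ray.rllib.utils.metrics import ENV_RUNNER_RESULTS, EPISODE_RETURN_MEAN

eval_results = algo.evaluate()
print(eval_results[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN])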
25 changes: 19 additions & 6 deletions rllib/env/tests/test_multi_agent_env.py
@@ -26,6 +26,11 @@
convert_ma_batch_to_sample_batch,
)
from ray.rllib.tests.test_nested_observation_spaces import NestedMultiAgentEnv
from ray.rllib.utils.metrics import (
NUM_ENV_STEPS_SAMPLED_LIFETIME,
ENV_RUNNER_RESULTS,
EPISODE_RETURN_MEAN,
)
from ray.rllib.utils.numpy import one_hot
from ray.rllib.utils.test_utils import check

@@ -600,7 +605,9 @@ def test_multi_agent_with_flex_agents(self):
result = algo.train()
print(
"Iteration {}, reward {}, timesteps {}".format(
i, result["episode_reward_mean"], result["timesteps_total"]
i,
result[f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}"],
result[f"{NUM_ENV_STEPS_SAMPLED_LIFETIME}"],
)
)
algo.stop()
@@ -620,7 +627,9 @@ def test_multi_agent_with_sometimes_zero_agents_observing(self):
result = algo.train()
print(
"Iteration {}, reward {}, timesteps {}".format(
i, result["episode_reward_mean"], result["timesteps_total"]
i,
result[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN],
result[f"{NUM_ENV_STEPS_SAMPLED_LIFETIME}"],
)
)
algo.stop()
@@ -682,7 +691,7 @@ def compute_actions(
episodes=None,
explore=True,
timestep=None,
**kwargs
**kwargs,
):
obs_shape = (len(obs_batch),)
actions = np.zeros(obs_shape, dtype=np.int32)
@@ -817,10 +826,12 @@ def test_train_multi_agent_cartpole_single_policy(self):
result = algo.train()
print(
"Iteration {}, reward {}, timesteps {}".format(
i, result["episode_reward_mean"], result["timesteps_total"]
i,
result[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN],
result[f"{NUM_ENV_STEPS_SAMPLED_LIFETIME}"],
)
)
if result["episode_reward_mean"] >= 50 * n:
if result[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN] >= 50 * n:
algo.stop()
return
raise Exception("failed to improve reward")
@@ -861,7 +872,9 @@ def gen_policy():
result = algo.train()
print(
"Iteration {}, reward {}, timesteps {}".format(
i, result["episode_reward_mean"], result["timesteps_total"]
i,
result[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN],
result[f"{NUM_ENV_STEPS_SAMPLED_LIFETIME}"],
)
)
self.assertTrue(
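The multi-agent tests above also swap the old top-level "timesteps_total" counter for NUM_ENV_STEPS_SAMPLED_LIFETIME. A minimal logging-loop sketch using the same constants (assuming algo is an already built Algorithm; the nested return access follows the style used elsewhere in this PR):

from ray.rllib.utils.metrics import (
    ENV_RUNNER_RESULTS,
    EPISODE_RETURN_MEAN,
    NUM_ENV_STEPS_SAMPLED_LIFETIME,
)

for i in range(3):
    result = algo.train()
    print(
        f"iter={i} "
        f"R={result[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN]} "
        f"ts={result[NUM_ENV_STEPS_SAMPLED_LIFETIME]}"
    )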
19 changes: 12 additions & 7 deletions rllib/evaluation/tests/test_rollout_worker.py
@@ -35,7 +35,12 @@
convert_ma_batch_to_sample_batch,
)
from ray.rllib.utils.annotations import override
from ray.rllib.utils.metrics import NUM_AGENT_STEPS_SAMPLED, NUM_AGENT_STEPS_TRAINED
from ray.rllib.utils.metrics import (
NUM_AGENT_STEPS_SAMPLED,
NUM_AGENT_STEPS_TRAINED,
ENV_RUNNER_RESULTS,
EPISODE_RETURN_MEAN,
)
from ray.rllib.utils.test_utils import check, framework_iterator
from ray.tune.registry import register_env

@@ -490,12 +495,12 @@ def test_reward_clipping(self):
result = collect_metrics(ws, [])
# Shows different behavior when connector is on/off.
if config.enable_connectors:
# episode_reward_mean shows the correct clipped value.
self.assertEqual(result["episode_reward_mean"], 10)
# episode_return_mean shows the correct clipped value.
self.assertEqual(result[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN], 10)
else:
# episode_reward_mean shows the unclipped raw value
# episode_return_mean shows the unclipped raw value
# when connector is off, and old env_runner v1 is used.
self.assertEqual(result["episode_reward_mean"], 1000)
self.assertEqual(result[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN], 1000)
ev.stop()

# Clipping in certain range (-2.0, 2.0).
@@ -534,7 +539,7 @@ def test_reward_clipping(self):
)
self.assertEqual(max(sample["rewards"]), 100)
result2 = collect_metrics(ws2, [])
self.assertEqual(result2["episode_reward_mean"], 1000)
self.assertEqual(result2[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN], 1000)
ev2.stop()

def test_metrics(self):
Expand Down Expand Up @@ -564,7 +569,7 @@ def test_metrics(self):
ray.get(remote_ev.sample.remote())
result = collect_metrics(ws)
self.assertEqual(result["episodes_this_iter"], 20)
self.assertEqual(result["episode_reward_mean"], 10)
self.assertEqual(result[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN], 10)
ev.stop()

def test_auto_vectorization(self):
11 changes: 9 additions & 2 deletions rllib/evaluation/tests/test_trajectory_view_api.py
@@ -22,6 +22,7 @@
)
from ray.rllib.policy.view_requirement import ViewRequirement
from ray.rllib.utils.annotations import override
from ray.rllib.utils.metrics import NUM_ENV_STEPS_SAMPLED_LIFETIME
from ray.rllib.utils.test_utils import framework_iterator, check


@@ -222,7 +223,10 @@ def test_traj_view_attention_net(self):
sample = rw.sample()
assert sample.count == algo.config.get_rollout_fragment_length()
results = algo.train()
assert results["timesteps_total"] == config["train_batch_size"]
assert (
results[f"{NUM_ENV_STEPS_SAMPLED_LIFETIME}"]
== config["train_batch_size"]
)
algo.stop()

def test_traj_view_next_action(self):
@@ -352,7 +356,10 @@ def test_counting_by_agent_steps(self):
results = None
for i in range(num_iterations):
results = algo.train()
self.assertEqual(results["agent_timesteps_total"], results["timesteps_total"])
self.assertEqual(
results["agent_timesteps_total"],
results[f"{NUM_ENV_STEPS_SAMPLED_LIFETIME}"],
)
self.assertEqual(
results["num_env_steps_trained"] * num_agents,
results["num_agent_steps_trained"],
6 changes: 5 additions & 1 deletion rllib/examples/_docs/rllib_on_rllib_readme.py
@@ -1,5 +1,9 @@
import gymnasium as gym
from ray.rllib.algorithms.ppo import PPOConfig
from ray.rllib.utils.metrics import (
ENV_RUNNER_RESULTS,
EPISODE_RETURN_MEAN,
)


# Define your problem using python and Farama-Foundation's gymnasium API:
@@ -69,7 +73,7 @@ def step(self, action):
# we can expect to reach an optimal episode reward of 0.0.
for i in range(5):
results = algo.train()
print(f"Iter: {i}; avg. reward={results['episode_reward_mean']}")
print(f"Iter: {i}; avg. reward={results[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN]}")

# Perform inference (action computations) based on given env observations.
# Note that we are using a slightly simpler env here (-3.0 to 3.0, instead
Changes to an additional file (path not shown)
@@ -18,6 +18,11 @@
create_open_spiel_checkpoint,
)
from ray.rllib.policy.policy import Policy
from ray.rllib.utils.metrics import (
ENV_RUNNER_RESULTS,
NUM_ENV_STEPS_SAMPLED_LIFETIME,
NUM_EPISODES,
)
from ray.tune import CLIReporter, register_env


@@ -117,10 +122,10 @@ def main(checkpoint_dir):
metric_columns={
"training_iteration": "iter",
"time_total_s": "time_total_s",
"num_env_steps_sampled_lifetime": "ts",
"env_runner_results/num_episodes": "train_episodes",
f"{NUM_ENV_STEPS_SAMPLED_LIFETIME}": "ts",
f"{ENV_RUNNER_RESULTS}/{NUM_EPISODES}": "train_episodes",
(
"env_runner_results/module_episode_returns_mean/" "main"
f"{ENV_RUNNER_RESULTS}/module_episode_returns_mean/" "main"
): "reward_main",
},
sort_by_metric=True,
6 changes: 5 additions & 1 deletion rllib/examples/_old_api_stack/custom_keras_model.py
@@ -14,6 +14,10 @@
from ray.rllib.models.tf.visionnet import VisionNetwork as MyVisionNetwork
from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID
from ray.rllib.utils.framework import try_import_tf
from ray.rllib.utils.metrics import (
ENV_RUNNER_RESULTS,
EPISODE_RETURN_MEAN,
)
from ray.rllib.utils.metrics.learner_info import LEARNER_INFO, LEARNER_STATS_KEY
from ray.tune.registry import get_trainable_cls

@@ -142,7 +146,7 @@ def on_train_result(self, *, algorithm, result, **kwargs):
)

stop = {
"env_runner_results/episode_return_mean": args.stop,
f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": args.stop,
}

tuner = tune.Tuner(
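The two example scripts above compose flattened "a/b" metric keys from the constants, for CLIReporter metric columns and for Tune stop criteria. Assuming Tune flattens nested result dicts with "/" separators (which is what these keys rely on), the two addressing styles relate as in this sketch:

from ray.rllib.utils.metrics import ENV_RUNNER_RESULTS, EPISODE_RETURN_MEAN

# Flattened form, as used for Tune stop criteria and CLIReporter columns:
flat_key = f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}"
stop = {flat_key: 150.0}

# Nested form, as used when reading an Algorithm.train() result directly:
# return_mean = result[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN]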