From ab81c8e9ca941e7029efcd6ccdbb0a73135a48b7 Mon Sep 17 00:00:00 2001
From: Rohan Potdar <105385119+rapotdar@users.noreply.github.com>
Date: Fri, 27 May 2022 04:14:54 -0700
Subject: [PATCH] [RLlib]: Rename `input_evaluation` to `off_policy_estimation_methods`. (#25107)

---
 doc/source/rllib/rllib-env.rst                |  2 +-
 doc/source/rllib/rllib-offline.rst            |  8 +++---
 doc/source/rllib/rllib-training.rst           |  2 +-
 .../marwil-halfcheetahbulletenv-v0.yaml       |  2 +-
 rllib/BUILD                                   |  2 +-
 rllib/agents/trainer.py                       | 18 ++++++++-----
 rllib/agents/trainer_config.py                | 27 +++++++++++--------
 rllib/algorithms/cql/cql.py                   |  2 +-
 rllib/algorithms/cql/tests/test_cql.py        |  2 +-
 rllib/algorithms/marwil/bc.py                 |  2 +-
 rllib/algorithms/marwil/marwil.py             |  4 ++-
 rllib/algorithms/marwil/tests/test_marwil.py  |  4 ++-
 rllib/evaluation/rollout_worker.py            | 23 +++++++++-------
 rllib/evaluation/worker_set.py                |  6 ++---
 .../parallel_evaluation_and_training.py       |  3 ---
 rllib/examples/serving/cartpole_server.py     |  2 +-
 rllib/examples/serving/unity3d_server.py      |  2 +-
 rllib/offline/off_policy_estimator.py         |  8 +++---
 rllib/tests/test_io.py                        | 14 +++++-----
 rllib/tests/test_nested_action_spaces.py      |  2 +-
 20 files changed, 74 insertions(+), 61 deletions(-)

diff --git a/doc/source/rllib/rllib-env.rst b/doc/source/rllib/rllib-env.rst
index 0939001b5055..7a675e6b51bf 100644
--- a/doc/source/rllib/rllib-env.rst
+++ b/doc/source/rllib/rllib-env.rst
@@ -521,7 +521,7 @@ You can configure any Trainer to launch a policy server with the following confi
         # Use the existing trainer process to run the server.
         "num_workers": 0,
         # Disable OPE, since the rollouts are coming from online clients.
-        "input_evaluation": [],
+        "off_policy_estimation_methods": [],
     }
 
 Clients can then connect in either *local* or *remote* inference mode. In local inference mode, copies of the policy are downloaded from the server and cached on the client for a configurable period of time. This allows actions to be computed by the client without requiring a network round trip each time. In remote inference mode, each computed action requires a network call to the server.
diff --git a/doc/source/rllib/rllib-offline.rst b/doc/source/rllib/rllib-offline.rst
index b9fe003ae49d..446526ab4b3c 100644
--- a/doc/source/rllib/rllib-offline.rst
+++ b/doc/source/rllib/rllib-offline.rst
@@ -48,7 +48,7 @@ Then, we can tell DQN to train using these previously generated experiences with
         --env=CartPole-v0 \
         --config='{
             "input": "/tmp/cartpole-out",
-            "input_evaluation": [],
+            "off_policy_estimation_methods": [],
             "explore": false}'
 
 .. _is:
@@ -62,7 +62,7 @@ Then, we can tell DQN to train using these previously generated experiences with
         --env=CartPole-v0 \
         --config='{
             "input": "/tmp/cartpole-out",
-            "input_evaluation": ["is", "wis"],
+            "off_policy_estimation_methods": ["is", "wis"],
             "exploration_config": {
                 "type": "SoftQ",
                 "temperature": 1.0,
@@ -90,7 +90,7 @@ This example plot shows the Q-value metric in addition to importance sampling (I
 
     print(estimator.estimate(episode))
 
-**Simulation-based estimation:** If true simulation is also possible (i.e., your env supports ``step()``), you can also set ``"input_evaluation": ["simulation"]`` to tell RLlib to run background simulations to estimate current policy performance. The output of these simulations will not be used for learning. Note that in all cases you still need to specify an environment object to define the action and observation spaces. However, you don't need to implement functions like reset() and step().
+**Simulation-based estimation:** If true simulation is also possible (i.e., your env supports ``step()``), you can also set ``"off_policy_estimation_methods": ["simulation"]`` to tell RLlib to run background simulations to estimate current policy performance. The output of these simulations will not be used for learning. Note that in all cases you still need to specify an environment object to define the action and observation spaces. However, you don't need to implement functions like reset() and step().
 
 Example: Converting external experiences to batch format
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -270,7 +270,7 @@ You can configure experience input for an agent using the following options:
     # - Any subclass of OffPolicyEstimator, e.g.
     #   ray.rllib.offline.estimators.is::ImportanceSampling or your own custom
     #   subclass.
-    "input_evaluation": [
+    "off_policy_estimation_methods": [
         ImportanceSampling,
         WeightedImportanceSampling,
     ],
diff --git a/doc/source/rllib/rllib-training.rst b/doc/source/rllib/rllib-training.rst
index b1072c75216a..280f0fac3cff 100644
--- a/doc/source/rllib/rllib-training.rst
+++ b/doc/source/rllib/rllib-training.rst
@@ -574,7 +574,7 @@ The following is a list of the common algorithm hyper-parameters:
     # - Any subclass of OffPolicyEstimator, e.g.
     #   ray.rllib.offline.estimators.is::ImportanceSampling or your own custom
     #   subclass.
-    "input_evaluation": [
+    "off_policy_estimation_methods": [
         ImportanceSampling,
         WeightedImportanceSampling,
     ],
diff --git a/release/rllib_tests/learning_tests/yaml_files/marwil-halfcheetahbulletenv-v0.yaml b/release/rllib_tests/learning_tests/yaml_files/marwil-halfcheetahbulletenv-v0.yaml
index 101b4f4c7916..27d59e39c897 100644
--- a/release/rllib_tests/learning_tests/yaml_files/marwil-halfcheetahbulletenv-v0.yaml
+++ b/release/rllib_tests/learning_tests/yaml_files/marwil-halfcheetahbulletenv-v0.yaml
@@ -11,7 +11,7 @@ marwil-halfcheetahbulletenv-v0:
         input: ["~/halfcheetah_expert_sac.zip"]
         actions_in_input_normalized: true
         # Switch off input evaluation (data does not contain action probs).
-        input_evaluation: []
+        off_policy_estimation_methods: []
 
         num_gpus: 1
 
diff --git a/rllib/BUILD b/rllib/BUILD
index afcf26ced1ad..9c78ec42abd3 100644
--- a/rllib/BUILD
+++ b/rllib/BUILD
@@ -1116,7 +1116,7 @@ py_test(
         "--env", "CartPole-v0",
         "--run", "DQN",
         "--stop", "'{\"training_iteration\": 1}'",
-        "--config", "'{\"framework\": \"tf\", \"input\": \"tests/data/cartpole\", \"replay_buffer_config\": {\"learning_starts\": 0}, \"input_evaluation\": [\"wis\", \"is\"], \"exploration_config\": {\"type\": \"SoftQ\"}}'"
+        "--config", "'{\"framework\": \"tf\", \"input\": \"tests/data/cartpole\", \"replay_buffer_config\": {\"learning_starts\": 0}, \"off_policy_estimation_methods\": [\"wis\", \"is\"], \"exploration_config\": {\"type\": \"SoftQ\"}}'"
     ]
 )
 
diff --git a/rllib/agents/trainer.py b/rllib/agents/trainer.py
index 077562fc771c..ba5ea1c14d70 100644
--- a/rllib/agents/trainer.py
+++ b/rllib/agents/trainer.py
@@ -34,6 +34,7 @@
 from ray.rllib.env.env_context import EnvContext
 from ray.rllib.env.utils import _gym_env_creator
 from ray.rllib.evaluation.episode import Episode
+from ray.rllib.utils import force_list
 from ray.rllib.evaluation.metrics import (
     collect_episodes,
     collect_metrics,
@@ -1890,14 +1891,17 @@ def validate_config(self, config: TrainerConfigDict) -> None:
             )
 
         # Offline RL settings.
-        if isinstance(config["input_evaluation"], tuple):
-            config["input_evaluation"] = list(config["input_evaluation"])
-        elif not isinstance(config["input_evaluation"], list):
-            raise ValueError(
-                "`input_evaluation` must be a list of strings, got {}!".format(
-                    config["input_evaluation"]
-                )
+        input_evaluation = config.get("input_evaluation")
+        if input_evaluation is not None and input_evaluation is not DEPRECATED_VALUE:
+            deprecation_warning(
+                old="config.input_evaluation: {}".format(input_evaluation),
+                new="config.off_policy_estimation_methods={}".format(input_evaluation),
+                error=False,
             )
+            config["off_policy_estimation_methods"] = input_evaluation
+        config["off_policy_estimation_methods"] = force_list(
+            config["off_policy_estimation_methods"]
+        )
 
         # Check model config.
         # If no preprocessing, propagate into model's config as well
diff --git a/rllib/agents/trainer_config.py b/rllib/agents/trainer_config.py
index ff3857ea8532..8543829cfcef 100644
--- a/rllib/agents/trainer_config.py
+++ b/rllib/agents/trainer_config.py
@@ -15,12 +15,8 @@
 from ray.rllib.evaluation.collectors.sample_collector import SampleCollector
 from ray.rllib.evaluation.collectors.simple_list_collector import SimpleListCollector
 from ray.rllib.models import MODEL_DEFAULTS
-from ray.rllib.offline.estimators.importance_sampling import ImportanceSampling
-from ray.rllib.offline.estimators.weighted_importance_sampling import (
-    WeightedImportanceSampling,
-)
 from ray.rllib.utils import deep_update, merge_dicts
-from ray.rllib.utils.deprecation import DEPRECATED_VALUE
+from ray.rllib.utils.deprecation import DEPRECATED_VALUE, deprecation_warning
 from ray.rllib.utils.typing import (
     EnvConfigDict,
     EnvType,
@@ -170,10 +166,7 @@ def __init__(self, trainer_class=None):
         self.input_ = "sampler"
         self.input_config = {}
         self.actions_in_input_normalized = False
-        self.input_evaluation = [
-            ImportanceSampling,
-            WeightedImportanceSampling,
-        ]
+        self.off_policy_estimation_methods = []
         self.postprocess_inputs = False
         self.shuffle_buffer_size = 0
         self.output = None
@@ -236,6 +229,7 @@ def __init__(self, trainer_class=None):
         self.prioritized_replay_alpha = DEPRECATED_VALUE
         self.prioritized_replay_beta = DEPRECATED_VALUE
         self.prioritized_replay_eps = DEPRECATED_VALUE
+        self.input_evaluation = DEPRECATED_VALUE
 
     def to_dict(self) -> TrainerConfigDict:
         """Converts all settings into a legacy config dict for backward compatibility.
@@ -862,6 +856,7 @@ def offline_data(
         input_config=None,
         actions_in_input_normalized=None,
         input_evaluation=None,
+        off_policy_estimation_methods=None,
         postprocess_inputs=None,
         shuffle_buffer_size=None,
         output=None,
@@ -906,7 +901,8 @@
                 are already normalized (between -1.0 and 1.0). This is usually the case
                 when the offline file has been generated by another RLlib algorithm
                 (e.g. PPO or SAC), while "normalize_actions" was set to True.
-            input_evaluation: Specify how to evaluate the current policy.
+            input_evaluation: DEPRECATED: Use `off_policy_estimation_methods` instead!
+            off_policy_estimation_methods: Specify how to evaluate the current policy.
                 This only has an effect when reading offline experiences
                 ("input" is not "sampler").
                Available options:
@@ -945,7 +941,16 @@ def offline_data(
         if actions_in_input_normalized is not None:
             self.actions_in_input_normalized = actions_in_input_normalized
         if input_evaluation is not None:
-            self.input_evaluation = input_evaluation
+            deprecation_warning(
+                old="offline_data(input_evaluation={})".format(input_evaluation),
+                new="offline_data(off_policy_estimation_methods={})".format(
+                    input_evaluation
+                ),
+                error=True,
+            )
+            self.off_policy_estimation_methods = input_evaluation
+        if off_policy_estimation_methods is not None:
+            self.off_policy_estimation_methods = off_policy_estimation_methods
         if postprocess_inputs is not None:
             self.postprocess_inputs = postprocess_inputs
         if shuffle_buffer_size is not None:
diff --git a/rllib/algorithms/cql/cql.py b/rllib/algorithms/cql/cql.py
index 7fd06c61a91c..4d252dba3314 100644
--- a/rllib/algorithms/cql/cql.py
+++ b/rllib/algorithms/cql/cql.py
@@ -67,7 +67,7 @@ def __init__(self, trainer_class=None):
 
         # Changes to Trainer's/SACConfig's default:
         # .offline_data()
-        self.input_evaluation = []
+        self.off_policy_estimation_methods = []
 
         # .reporting()
         self.min_sample_timesteps_per_reporting = 0
diff --git a/rllib/algorithms/cql/tests/test_cql.py b/rllib/algorithms/cql/tests/test_cql.py
index b36ffb089f9c..bcb56fc9bd2e 100644
--- a/rllib/algorithms/cql/tests/test_cql.py
+++ b/rllib/algorithms/cql/tests/test_cql.py
@@ -51,7 +51,7 @@ def test_cql_compilation(self):
                 # RLlib algorithm (e.g. PPO or SAC).
                 actions_in_input_normalized=False,
                 # Switch on off-policy evaluation.
-                input_evaluation=["is"],
+                off_policy_estimation_methods=["is"],
             )
             .training(
                 clip_actions=False,
diff --git a/rllib/algorithms/marwil/bc.py b/rllib/algorithms/marwil/bc.py
index ee754b98bec4..a562e86ec7c2 100644
--- a/rllib/algorithms/marwil/bc.py
+++ b/rllib/algorithms/marwil/bc.py
@@ -49,7 +49,7 @@ def __init__(self, trainer_class=None):
         # not important for behavioral cloning.
         self.postprocess_inputs = False
         # No reward estimation.
-        self.input_evaluation = []
+        self.off_policy_estimation_methods = []
         # __sphinx_doc_end__
         # fmt: on
 
diff --git a/rllib/algorithms/marwil/marwil.py b/rllib/algorithms/marwil/marwil.py
index 92ae2a02aa3c..45ea833aa023 100644
--- a/rllib/algorithms/marwil/marwil.py
+++ b/rllib/algorithms/marwil/marwil.py
@@ -103,7 +103,9 @@ def __init__(self, trainer_class=None):
         # the same line.
         self.input_ = "sampler"
         # Use importance sampling estimators for reward.
-        self.input_evaluation = [ImportanceSampling, WeightedImportanceSampling]
+        self.off_policy_estimation_methods = [
+            ImportanceSampling, WeightedImportanceSampling
+        ]
         self.postprocess_inputs = True
         self.lr = 1e-4
         self.train_batch_size = 2000
diff --git a/rllib/algorithms/marwil/tests/test_marwil.py b/rllib/algorithms/marwil/tests/test_marwil.py
index d1e2e14ca691..a3bc4d5d5e9c 100644
--- a/rllib/algorithms/marwil/tests/test_marwil.py
+++ b/rllib/algorithms/marwil/tests/test_marwil.py
@@ -115,7 +115,9 @@ def test_marwil_cont_actions_from_offline_file(self):
         config["evaluation_config"] = {"input": "sampler"}
         # Learn from offline data.
         config["input"] = [data_file]
-        config["input_evaluation"] = []  # disable (data has no action-probs)
+        config[
+            "off_policy_estimation_methods"
+        ] = []  # disable (data has no action-probs)
         num_iterations = 3
 
         # Test for all frameworks.
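
Editor's note (not part of the patch): before the worker-level changes below, here is a minimal usage sketch of the renamed setting, written against the legacy config-dict API that the docs and server examples in this patch use. The `DQNTrainer` import path and the `/tmp/cartpole-out` data path are assumptions taken from the RLlib docs of this era, not something this diff adds:

```python
import ray
from ray.rllib.agents.dqn import DQNTrainer

ray.init()

trainer = DQNTrainer(
    env="CartPole-v0",
    config={
        # Read previously collected experiences instead of sampling online.
        "input": "/tmp/cartpole-out",
        # Renamed key (was `input_evaluation`): which off-policy estimators to run.
        # The "is"/"wis" string shorthands still work, but emit a deprecation
        # warning pointing at the estimator classes (see rollout_worker.py below).
        "off_policy_estimation_methods": ["is", "wis"],
        # OPE needs action probabilities, so make the policy stochastic.
        "exploration_config": {"type": "SoftQ", "temperature": 1.0},
    },
)
results = trainer.train()
```
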
diff --git a/rllib/evaluation/rollout_worker.py b/rllib/evaluation/rollout_worker.py
index 71659a873183..ed71c5cbba60 100644
--- a/rllib/evaluation/rollout_worker.py
+++ b/rllib/evaluation/rollout_worker.py
@@ -43,7 +43,10 @@
 from ray.rllib.utils import force_list, merge_dicts, check_env
 from ray.rllib.utils.annotations import DeveloperAPI, ExperimentalAPI
 from ray.rllib.utils.debug import summarize, update_global_seed_if_necessary
-from ray.rllib.utils.deprecation import Deprecated, deprecation_warning
+from ray.rllib.utils.deprecation import (
+    Deprecated,
+    deprecation_warning,
+)
 from ray.rllib.utils.error import ERR_MSG_NO_GPUS, HOWTO_CHANGE_CONFIG
 from ray.rllib.utils.filter import get_filter, Filter
 from ray.rllib.utils.framework import try_import_tf, try_import_torch
@@ -239,7 +242,7 @@ def __init__(
         input_creator: Callable[
             [IOContext], InputReader
         ] = lambda ioctx: ioctx.default_sampler_input(),
-        input_evaluation: List[str] = frozenset([]),
+        off_policy_estimation_methods: List[str] = frozenset([]),
         output_creator: Callable[
             [IOContext], OutputWriter
         ] = lambda ioctx: NoopOutput(),
@@ -336,8 +339,8 @@ def __init__(
                 DefaultCallbacks for training/policy/rollout-worker callbacks.
             input_creator: Function that returns an InputReader object for
                 loading previous generated experiences.
-            input_evaluation: How to evaluate the policy performance. Setting this only
-                makes sense when the input is reading offline data.
+            off_policy_estimation_methods: How to evaluate the policy performance.
+                Setting this only makes sense when the input is reading offline data.
                 Available options:
                 - "simulation" (str): Run the environment in the background, but use
                   this data for evaluation only and not for learning.
@@ -696,22 +699,22 @@ def wrap(env):
             log_dir, policy_config, worker_index, self
         )
         self.reward_estimators: List[OffPolicyEstimator] = []
-        for method in input_evaluation:
+        for method in off_policy_estimation_methods:
             if method == "is":
                 method = ImportanceSampling
                 deprecation_warning(
-                    old="config.input_evaluation=[is]",
+                    old="config.off_policy_estimation_methods=[is]",
                     new="from ray.rllib.offline.estimators import "
-                    f"{method.__name__}; config.input_evaluation="
+                    f"{method.__name__}; config.off_policy_estimation_methods="
                     f"[{method.__name__}]",
                     error=False,
                 )
             elif method == "wis":
                 method = WeightedImportanceSampling
                 deprecation_warning(
-                    old="config.input_evaluation=[wis]",
+                    old="config.off_policy_estimation_methods=[wis]",
                     new="from ray.rllib.offline.estimators import "
-                    f"{method.__name__}; config.input_evaluation="
+                    f"{method.__name__}; config.off_policy_estimation_methods="
                     f"[{method.__name__}]",
                     error=False,
                 )
@@ -753,7 +756,7 @@ def wrap(env):
                 multiple_episodes_in_batch=pack,
                 normalize_actions=normalize_actions,
                 clip_actions=clip_actions,
-                blackhole_outputs="simulation" in input_evaluation,
+                blackhole_outputs="simulation" in off_policy_estimation_methods,
                 soft_horizon=soft_horizon,
                 no_done_at_end=no_done_at_end,
                 observation_fn=observation_fn,
diff --git a/rllib/evaluation/worker_set.py b/rllib/evaluation/worker_set.py
index 14a035ac86b7..cb5be85c869a 100644
--- a/rllib/evaluation/worker_set.py
+++ b/rllib/evaluation/worker_set.py
@@ -609,9 +609,9 @@ def valid_module(class_path):
             )
 
         if config["input"] == "sampler":
-            input_evaluation = []
+            off_policy_estimation_methods = []
         else:
-            input_evaluation = config["input_evaluation"]
+            off_policy_estimation_methods = config["off_policy_estimation_methods"]
 
         # Assert everything is correct in "multiagent" config dict (if given).
         ma_policies = config["multiagent"]["policies"]
@@ -664,7 +664,7 @@ def valid_module(class_path):
             log_level=config["log_level"],
             callbacks=config["callbacks"],
             input_creator=input_creator,
-            input_evaluation=input_evaluation,
+            off_policy_estimation_methods=off_policy_estimation_methods,
             output_creator=output_creator,
             remote_worker_envs=config["remote_worker_envs"],
             remote_env_batch_wait_ms=config["remote_env_batch_wait_ms"],
diff --git a/rllib/examples/parallel_evaluation_and_training.py b/rllib/examples/parallel_evaluation_and_training.py
index fca0eae84f67..10f08d70ea06 100644
--- a/rllib/examples/parallel_evaluation_and_training.py
+++ b/rllib/examples/parallel_evaluation_and_training.py
@@ -134,9 +134,6 @@ def on_train_result(self, *, trainer, result, **kwargs):
         # Evaluate every other training iteration (together
         # with every other call to Trainer.train()).
         "evaluation_interval": args.evaluation_interval,
-        "evaluation_config": {
-            "input_evaluation": ["is"],
-        },
         # Run for n episodes/timesteps (properly distribute load amongst
         # all eval workers). The longer it takes to evaluate, the more sense
         # it makes to use `evaluation_parallel_to_training=True`.
diff --git a/rllib/examples/serving/cartpole_server.py b/rllib/examples/serving/cartpole_server.py
index 1d60946ddc97..e024258a0354 100755
--- a/rllib/examples/serving/cartpole_server.py
+++ b/rllib/examples/serving/cartpole_server.py
@@ -165,7 +165,7 @@ def _input(ioctx):
         # Use n worker processes to listen on different ports.
         "num_workers": args.num_workers,
         # Disable OPE, since the rollouts are coming from online clients.
-        "input_evaluation": [],
+        "off_policy_estimation_methods": [],
         # Create a "chatty" client/server or not.
         "callbacks": MyCallbacks if args.callbacks_verbose else None,
         # DL framework to use.
diff --git a/rllib/examples/serving/unity3d_server.py b/rllib/examples/serving/unity3d_server.py
index 636b703e703f..5e1132aa9149 100755
--- a/rllib/examples/serving/unity3d_server.py
+++ b/rllib/examples/serving/unity3d_server.py
@@ -132,7 +132,7 @@ def _input(ioctx):
         # Use n worker processes to listen on different ports.
         "num_workers": args.num_workers,
         # Disable OPE, since the rollouts are coming from online clients.
-        "input_evaluation": [],
+        "off_policy_estimation_methods": [],
         # Other settings.
         "train_batch_size": 256,
         "rollout_fragment_length": 20,
diff --git a/rllib/offline/off_policy_estimator.py b/rllib/offline/off_policy_estimator.py
index 9f1e2068daa1..14c0379559de 100644
--- a/rllib/offline/off_policy_estimator.py
+++ b/rllib/offline/off_policy_estimator.py
@@ -55,7 +55,7 @@ def create_from_io_context(cls, ioctx: IOContext) -> "OffPolicyEstimator":
         if len(keys) > 1:
             raise NotImplementedError(
                 "Off-policy estimation is not implemented for multi-agent. "
-                "You can set `input_evaluation: []` to resolve this."
+                "You can set `off_policy_estimation_methods: []` to resolve this."
             )
         policy = ioctx.worker.get_policy(keys[0])
         return cls(policy, gamma)
@@ -134,8 +134,8 @@ def check_can_estimate_for(self, batch: SampleBatchType) -> None:
 
         if isinstance(batch, MultiAgentBatch):
             raise ValueError(
-                "IS-estimation is not implemented for multi-agent batches. "
-                "You can set `input_evaluation: []` to resolve this."
+                "off-policy estimation is not implemented for multi-agent batches. "
+                "You can set `off_policy_estimation_methods: []` to resolve this."
             )
 
         if "action_prob" not in batch:
@@ -144,7 +144,7 @@ def check_can_estimate_for(self, batch: SampleBatchType) -> None:
                 "include action probabilities (i.e., the policy is stochastic "
                 "and emits the 'action_prob' key). For DQN this means using "
                 "`exploration_config: {type: 'SoftQ'}`. You can also set "
-                "`input_evaluation: []` to disable estimation."
+                "`off_policy_estimation_methods: []` to disable estimation."
             )
 
     @DeveloperAPI
diff --git a/rllib/tests/test_io.py b/rllib/tests/test_io.py
index 73f4763d0151..06d88ef11807 100644
--- a/rllib/tests/test_io.py
+++ b/rllib/tests/test_io.py
@@ -98,7 +98,7 @@ def test_agent_input_dir(self):
                 env="CartPole-v0",
                 config={
                     "input": self.test_dir + fw,
-                    "input_evaluation": [],
+                    "off_policy_estimation_methods": [],
                     "framework": fw,
                 },
             )
@@ -141,7 +141,7 @@ def test_agent_input_postprocessing_enabled(self):
                 env="CartPole-v0",
                 config={
                     "input": self.test_dir + fw,
-                    "input_evaluation": [],
+                    "off_policy_estimation_methods": [],
                     "postprocess_inputs": True,  # adds back 'advantages'
                     "framework": fw,
                 },
@@ -158,7 +158,7 @@ def test_agent_input_eval_sim(self):
                 env="CartPole-v0",
                 config={
                     "input": self.test_dir + fw,
-                    "input_evaluation": ["simulation"],
+                    "off_policy_estimation_methods": ["simulation"],
                     "framework": fw,
                 },
             )
@@ -176,7 +176,7 @@ def test_agent_input_list(self):
                 env="CartPole-v0",
                 config={
                     "input": glob.glob(self.test_dir + fw + "/*.json"),
-                    "input_evaluation": [],
+                    "off_policy_estimation_methods": [],
                     "rollout_fragment_length": 99,
                     "framework": fw,
                 },
@@ -196,7 +196,7 @@ def test_agent_input_dict(self):
                         "sampler": 0.9,
                     },
                     "train_batch_size": 2000,
-                    "input_evaluation": [],
+                    "off_policy_estimation_methods": [],
                     "framework": fw,
                 },
             )
@@ -234,7 +234,7 @@ def test_multi_agent(self):
                 config={
                     "num_workers": 0,
                     "input": self.test_dir,
-                    "input_evaluation": ["simulation"],
+                    "off_policy_estimation_methods": ["simulation"],
                     "train_batch_size": 2000,
                     "multiagent": {
                         "policies": {"policy_1", "policy_2"},
@@ -276,7 +276,7 @@ def input_creator(ioctx: IOContext) -> InputReader:
                 config={
                     "input": input_procedure,
                     "input_config": {"input_files": self.test_dir + fw},
-                    "input_evaluation": [],
+                    "off_policy_estimation_methods": [],
                     "framework": fw,
                 },
             )
diff --git a/rllib/tests/test_nested_action_spaces.py b/rllib/tests/test_nested_action_spaces.py
index a2b6dd07e7e5..49230ed74202 100644
--- a/rllib/tests/test_nested_action_spaces.py
+++ b/rllib/tests/test_nested_action_spaces.py
@@ -69,7 +69,7 @@ def test_nested_action_spaces(self):
         config["output"] = tmp_dir
         # Switch off OPE as we don't write action-probs.
        # TODO: We should probably always write those if `output` is given.
-        config["input_evaluation"] = []
+        config["off_policy_estimation_methods"] = []
         # Pretend actions in offline files are already normalized.
         config["actions_in_input_normalized"] = True
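
Editor's note (not part of the diff): for readers skimming the patch, a small, self-contained sketch that mirrors the backward-compatibility path added to `Trainer.validate_config()` above. An old-style `input_evaluation` entry is copied over to `off_policy_estimation_methods` with a deprecation warning, and the final value is normalized to a list (the role `force_list` plays in the patch). The helper name and the plain `warnings` module are illustrative stand-ins, not RLlib APIs:

```python
import warnings
from typing import Any, Dict


def remap_input_evaluation(config: Dict[str, Any]) -> Dict[str, Any]:
    """Illustrative mirror of the old-key -> new-key handling in validate_config()."""
    legacy = config.pop("input_evaluation", None)
    if legacy is not None:
        warnings.warn(
            "`input_evaluation` has been renamed to `off_policy_estimation_methods`.",
            DeprecationWarning,
        )
        config["off_policy_estimation_methods"] = legacy
    # force_list() semantics: leave lists/tuples alone, wrap single values.
    methods = config.get("off_policy_estimation_methods", [])
    if not isinstance(methods, (list, tuple)):
        methods = [methods]
    config["off_policy_estimation_methods"] = list(methods)
    return config


# An old-style config still works, but now lands on the new key:
old_style = {"input": "/tmp/cartpole-out", "input_evaluation": ["is", "wis"]}
print(remap_input_evaluation(old_style))
# -> {'input': '/tmp/cartpole-out', 'off_policy_estimation_methods': ['is', 'wis']}
```
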