[RLlib] By-pass Evaluation workers when doing OPE #30135

Merged 37 commits on Nov 16, 2022
Commits (37)
04e3838
wip: working on wis
kouroshHakha Nov 8, 2022
3dd5c17
wip: added is
kouroshHakha Nov 8, 2022
678ab3a
wip: parallelized feature_importance using ray core and added estimat…
kouroshHakha Nov 9, 2022
e283531
updated DR
kouroshHakha Nov 9, 2022
550c0eb
wip added dm
kouroshHakha Nov 9, 2022
732aeb9
lint and resource allocation
kouroshHakha Nov 10, 2022
6b5ebde
parallel to trianing done
kouroshHakha Nov 10, 2022
24b8c53
fixing some unittests
kouroshHakha Nov 10, 2022
7c36c71
fixed marwil tests
kouroshHakha Nov 10, 2022
c61a49d
fixed unittest
kouroshHakha Nov 10, 2022
073a4fd
for episodic RL on OPE fall back to the previous method
kouroshHakha Nov 10, 2022
3f66599
lint
kouroshHakha Nov 10, 2022
4e5ec05
added docstring for algorithm.py
kouroshHakha Nov 10, 2022
a0ca2e3
docstrings
kouroshHakha Nov 10, 2022
41f62b8
updated feature_imporance docstrings and code to remove remove_t_dim …
kouroshHakha Nov 10, 2022
d320f05
updated docstring feature_importance
kouroshHakha Nov 10, 2022
42a676a
docstrings for is
kouroshHakha Nov 10, 2022
bb797c2
docstrings
kouroshHakha Nov 10, 2022
ae2e376
fqe docstrings
kouroshHakha Nov 10, 2022
916a4ef
docstrings
kouroshHakha Nov 10, 2022
c797f0a
unittested is and wis
kouroshHakha Nov 11, 2022
cdc1b67
minor fix on dm
kouroshHakha Nov 14, 2022
4c497c7
1. fixed MARWIL bug 2. fixed algo_class being empty in AlgorithmConfi…
kouroshHakha Nov 14, 2022
e0ebf5f
remove some old mysterious ddpg validation check which does not make …
kouroshHakha Nov 14, 2022
877f7b2
Merge branch 'master' into remove-evalworkers-in-ope
kouroshHakha Nov 14, 2022
18db127
Merge branch 'master' into remove-evalworkers-in-ope
kouroshHakha Nov 14, 2022
194fde8
Merge branch 'remove-evalworkers-in-ope' of github.com:kouroshHakha/r…
kouroshHakha Nov 14, 2022
275be35
removed resource pre-allocation for ray dataset per each tune trial
kouroshHakha Nov 14, 2022
0b7bbd8
fixed some previous review points
kouroshHakha Nov 14, 2022
e72e8c4
lint
kouroshHakha Nov 14, 2022
4828f03
added ste
kouroshHakha Nov 14, 2022
72e2bd6
lint
kouroshHakha Nov 14, 2022
7d93827
added some more comments to explain the resource allocation problem w…
kouroshHakha Nov 14, 2022
8641256
fixed ci
kouroshHakha Nov 14, 2022
5fbb9c8
fixed doc
kouroshHakha Nov 15, 2022
43bacc4
addressed jun's comments
kouroshHakha Nov 15, 2022
eb82c84
raise value errors
kouroshHakha Nov 15, 2022
146 changes: 112 additions & 34 deletions rllib/algorithms/algorithm.py
@@ -26,6 +26,7 @@
Union,
)
from ray.rllib.offline.offline_evaluator import OfflineEvaluator
from ray.rllib.offline.offline_evaluation_utils import remove_time_dim
import tree

import ray
@@ -52,7 +53,7 @@
from ray.rllib.execution.parallel_requests import AsyncRequestsManager
from ray.rllib.execution.rollout_ops import synchronous_parallel_sample
from ray.rllib.execution.train_ops import multi_gpu_train_one_step, train_one_step
from ray.rllib.offline import get_offline_io_resource_bundles
from ray.rllib.offline import get_dataset_and_shards
from ray.rllib.offline.estimators import (
OffPolicyEstimator,
ImportanceSampling,
@@ -592,7 +593,8 @@ def setup(self, config: AlgorithmConfig) -> None:

# Evaluation WorkerSet setup.
# User would like to setup a separate evaluation worker set.
if self.config.evaluation_num_workers > 0 or self.config.evaluation_interval:
# Note: We skip workerset creation if we need to do offline evaluation
if self._should_create_evaluation_rollout_workers(self.evaluation_config):
_, env_creator = self._get_env_id_and_creator(
self.evaluation_config.env, self.evaluation_config
)
@@ -620,6 +622,26 @@ def setup(self, config: AlgorithmConfig) -> None:
)
self._evaluation_weights_seq_number = 0

self.evaluation_dataset = None
if (
self.evaluation_config.off_policy_estimation_methods
and not self.evaluation_config.ope_split_batch_by_episode
Member:
I have lost track a little bit. What if there are off_policy_estimation_methods and ope_split_batch_by_episode is True? How is that mode handled?
Kind of feel like we need a single function def _setup_evaluation(self), and in there, a simple list of the different online/offline evaluation types that we may have, like:

def _setup_evaluation(self):
    if (
        self.evaluation_config.off_policy_estimation_methods and
        not self.evaluation_config.ope_split_batch_by_episode
    ):
        self._setup_offline_bandit_eval()  # DS based offline bandit data eval.
    elif self.config.evaluation_num_workers > 0:
        assert self.config.evaluation_config.env, ...
        self._setup_evaluation_workers()  # Online eval.
    else:
        # What do we do here?

Contributor Author:
If OPE is specified with split_batch_by_episode=True (the episodic case), we fall back to the old behavior, i.e. use evaluation workers that each have a shard of the dataset. It is not clear how I could use Ray Dataset map_batches to do processing on an episode level. In the future we can do that consolidation, but until then let's just keep the old behavior.

Member:
OK, but how do I tell that this is the behavior here?
Like, we are going to get to this if statement of off_policy_estimation_methods and not ope_split_batch_by_episode; if it doesn't match, where do we fall to? How do I know the evaluation workers are guaranteed to be there?
To be clear, I am not saying the behavior is bad, I am just saying maybe we can write the code in a way that makes this flow clearer. Wdyt?

Contributor Author:
Yep. Let me explain what happens:
If this condition is true, self.evaluation_dataset gets set to a dataset; otherwise it stays None. In self.evaluate(), if self.evaluation_dataset is None we fall back to the old behavior, which is using evaluation_workers. Now the question is: have we actually created evaluation_workers when they were necessary? The answer is yes. It's in the setup() function, where we call self._should_create_evaluation_rollout_workers(self.evaluation_config). How do you think this should be rewritten so that it is clearer?

Member:
OK, fine for now.
Personally I would feel a lot safer if this whole thing were a single if...else...; then I'd know all cases are covered.
Right now creation of self.evaluation_workers is conditioned on self._should_create_evaluation_rollout_workers(), and creation of evaluation_dataset is conditioned on off_policy_estimation_methods and not ope_split_batch_by_episode.
It's just hard to tell whether these two are mutually exclusive or not. You get the idea.
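For reference, here is a condensed sketch of the flow this thread is describing, paraphrased from the diff below rather than copied verbatim:

def setup(self, config):
    # Rollout-based evaluation workers are only created when we are NOT
    # doing dataset-based offline evaluation
    # (see _should_create_evaluation_rollout_workers()).
    if self._should_create_evaluation_rollout_workers(self.evaluation_config):
        self.evaluation_workers = ...  # online eval, or episodic OPE via workers

    self.evaluation_dataset = None
    if (
        self.evaluation_config.off_policy_estimation_methods
        and not self.evaluation_config.ope_split_batch_by_episode
    ):
        # Bandit-style OPE: read the offline data as a Ray Dataset instead.
        self.evaluation_dataset = ...

def evaluate(self):
    if self.evaluation_dataset is not None:
        # New path: OPE runs directly on the dataset; no eval workers involved.
        return {"evaluation": self._run_offline_evaluation()}
    # Old path: sync weights to self.evaluation_workers and sample as before.
    ...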

):
# the num worker is set to 0 to avoid creating shards. The dataset will not
# be repartitioned into num_workers blocks.
logger.info("Creating evaluation dataset ...")
ds, _ = get_dataset_and_shards(self.evaluation_config, num_workers=0)

# Dataset should be in form of one episode per row. in case of bandits each
# row is just one time step. To make the computation more efficient later
# we remove the time dimension here.
parallelism = self.evaluation_config.evaluation_num_workers or 1
Member:
We do have a parallelism parameter in evaluation_config. Don't we want to use it in place of evaluation_num_workers?
It reads a little weird because on one hand we say we are not going to create evaluation workers, while on the other hand evaluation_num_workers actually plays a critical role here.

Contributor Author:
The current users (you know who :)) get parallelism by defining the number of evaluation workers. I would love to actually change that at some point. To avoid further confusion I am getting rid of parallelism as a user-facing knob. If you look in validate(), I am overriding the parallelism in evaluation_config.
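In other words, evaluation_num_workers stays the single user-facing knob and the dataset path simply reuses it as its parallelism. Illustrative snippet (not part of the diff):

config = config.evaluation(
    evaluation_interval=1,
    evaluation_num_workers=4,
)
# With evaluation rollout workers: 4 eval RolloutWorkers get created.
# With dataset-based OPE: the dataset is processed with parallelism=4
# (validate() copies this value into input_config["parallelism"], and
# estimate_on_dataset() is called with n_parallelism=4).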

batch_size = max(ds.count() // parallelism, 1)
self.evaluation_dataset = ds.map_batches(
remove_time_dim, batch_size=batch_size
)
logger.info("Evaluation dataset created")

self.reward_estimators: Dict[str, OffPolicyEstimator] = {}
ope_types = {
"is": ImportanceSampling,
@@ -654,7 +676,7 @@ def setup(self, config: AlgorithmConfig) -> None:
raise ValueError(
f"Unknown off_policy_estimation type: {method_type}! Must be "
"either a class path or a sub-class of ray.rllib."
"offline.estimators.off_policy_estimator::OffPolicyEstimator"
"offline.offline_evaluator::OfflineEvaluator"
)

# Run `on_algorithm_init` callback after initialization is done.
@@ -803,6 +825,9 @@ def evaluate(
# Call the `_before_evaluate` hook.
self._before_evaluate()

if self.evaluation_dataset is not None:
return {"evaluation": self._run_offline_evaluation()}

# Sync weights to the evaluation WorkerSet.
if self.evaluation_workers is not None:
self.evaluation_workers.sync_weights(
@@ -1976,50 +2001,62 @@ def default_resource_request(
# workers to determine their CPU/GPU resource needs.

# Convenience config handles.
default_config = cls.get_default_config()
# TODO: Have to make this work for now for AlgorithmConfigs (returned by
# get_default_config(). Use only AlgorithmConfigs once all Algorithms
# return an AlgorothmConfig from their get_default_config() method.
if not isinstance(default_config, dict):
default_config = default_config.to_dict()
cf = dict(default_config, **config)
eval_cf = cf["evaluation_config"] or {}
cf = cls.get_default_config().update_from_dict(config)
cf.validate()
cf.freeze()

# get evaluation config
eval_cf = cf.get_evaluation_config_object()
eval_cf.validate()
eval_cf.freeze()

# resources for local worker
local_worker = {
"CPU": cf["num_cpus_for_driver"],
"GPU": 0 if cf["_fake_gpus"] else cf["num_gpus"],
"CPU": cf.num_cpus_for_local_worker,
"GPU": 0 if cf._fake_gpus else cf.num_gpus,
}

bundles = [local_worker]

# resources for rollout env samplers
rollout_workers = [
{
"CPU": cf["num_cpus_per_worker"],
"GPU": cf["num_gpus_per_worker"],
**cf["custom_resources_per_worker"],
"CPU": cf.num_cpus_per_worker,
"GPU": cf.num_gpus_per_worker,
**cf.custom_resources_per_worker,
}
for _ in range(cf["num_workers"])
for _ in range(cf.num_rollout_workers)
]

bundles = [local_worker] + rollout_workers

if cf["evaluation_interval"]:
# resources for evaluation env samplers or datasets (if any)
if cls._should_create_evaluation_rollout_workers(eval_cf):
# Evaluation workers.
# Note: The local eval worker is located on the driver CPU.
bundles += [
evaluation_bundle = [
{
"CPU": eval_cf.get(
"num_cpus_per_worker", cf["num_cpus_per_worker"]
),
"GPU": eval_cf.get(
"num_gpus_per_worker", cf["num_gpus_per_worker"]
),
**eval_cf.get(
"custom_resources_per_worker", cf["custom_resources_per_worker"]
),
"CPU": eval_cf.num_cpus_per_worker,
"GPU": eval_cf.num_gpus_per_worker,
**eval_cf.custom_resources_per_worker,
}
for _ in range(cf["evaluation_num_workers"])
for _ in range(eval_cf.evaluation_num_workers)
]

# In case our I/O reader/writer requires compute resources.
bundles += get_offline_io_resource_bundles(cf)
else:
# resources for offline dataset readers during evaluation
# Note (Kourosh): we should not claim extra workers for
# training on the offline dataset, since rollout workers have already
# claimed it.
# Another Note (Kourosh): dataset reader will not use placement groups so
# whatever we specify here won't matter because dataset won't even use it.
# Disclaimer: using ray dataset in tune may cause deadlock when multiple
# tune trials get scheduled on the same node and do not leave any spare
# resources for dataset operations. The workaround is to limit the
# max_concurrent trials so that some spare cpus are left for dataset
# operations. This behavior should get fixed by the dataset team. more info
# found here:
# https://docs.ray.io/en/master/data/dataset-internals.html#datasets-tune
evaluation_bundle = []

bundles += rollout_workers + evaluation_bundle

# Return PlacementGroupFactory containing all needed resources
# (already properly defined as device bundles).
@@ -2632,6 +2669,7 @@ def _run_one_evaluation(
Returns:
The results dict from the evaluation call.
"""

eval_results = {
"evaluation": {
"episode_reward_max": np.nan,
@@ -2715,6 +2753,46 @@ def _run_one_training_iteration_and_evaluation_in_parallel(

return results, train_iter_ctx

def _run_offline_evaluation(self):
"""Runs offline evaluation via `OfflineEvaluator.estimate_on_dataset()` API.

This method will be used when `evaluation_dataset` is provided.
Note: This will only work if the policy is a single agent policy.

Returns:
The results dict from the offline evaluation call.
"""
assert len(self.workers.local_worker().policy_map) == 1

parallelism = self.evaluation_config.evaluation_num_workers or 1
offline_eval_results = {"off_policy_estimator": {}}
for evaluator_name, offline_evaluator in self.reward_estimators.items():
offline_eval_results["off_policy_estimator"][
evaluator_name
] = offline_evaluator.estimate_on_dataset(
self.evaluation_dataset,
n_parallelism=parallelism,
)
return offline_eval_results

@classmethod
def _should_create_evaluation_rollout_workers(cls, eval_config: "AlgorithmConfig"):
"""Determines whether we need to create evaluation workers.

Returns False if we need to run offline evaluation
(with ope.estimate_on_dataset API) or when local worker is to be used for
evaluation. Note: We only use estimate_on_dataset API with bandits for now.
That is when ope_split_batch_by_episode is False. TODO: In future we will do
the same for episodic RL OPE.
"""
run_offline_evaluation = (
eval_config.get("off_policy_estimation_methods")
and not eval_config.ope_split_batch_by_episode
)
return not run_offline_evaluation and (
eval_config.evaluation_num_workers > 0 or eval_config.evaluation_interval
)

@staticmethod
def _automatic_evaluation_duration_fn(
unit, num_eval_workers, eval_cfg, train_future, num_units_done
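Putting the algorithm.py changes together, here is a minimal usage sketch of the new code path. The dataset path, file format, and estimator picks are illustrative assumptions, not taken from this diff:

from ray.rllib.algorithms.bc import BCConfig
from ray.rllib.offline.estimators import (
    ImportanceSampling,
    WeightedImportanceSampling,
)

config = (
    BCConfig()
    .offline_data(
        input_="dataset",
        # Hypothetical path and keys; see the input_config docstring change below.
        input_config={"format": "json", "paths": "/tmp/bandit_data.json"},
    )
    .evaluation(
        evaluation_interval=1,
        evaluation_num_workers=2,          # reused as dataset parallelism
        ope_split_batch_by_episode=False,  # bandit data: one time step per row
        off_policy_estimation_methods={
            "is": {"type": ImportanceSampling},
            "wis": {"type": WeightedImportanceSampling},
        },
    )
)

algo = config.build()
results = algo.evaluate()
# -> {"evaluation": {"off_policy_estimator": {"is": {...}, "wis": {...}}}}

When such a run is launched through Tune, the resource note in default_resource_request above applies: cap the number of concurrent trials (for example via the Tuner's max_concurrent_trials setting) so that some CPUs stay free for Ray Dataset operations.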
53 changes: 50 additions & 3 deletions rllib/algorithms/algorithm_config.py
@@ -647,7 +647,10 @@ def validate(self) -> None:
from ray.rllib.policy.dynamic_tf_policy import DynamicTFPolicy
from ray.rllib.policy.torch_policy import TorchPolicy

default_policy_cls = self.algo_class.get_default_policy_class(self)
default_policy_cls = None
if self.algo_class:
default_policy_cls = self.algo_class.get_default_policy_class(self)

policies = self.policies
policy_specs = (
[
@@ -680,6 +683,29 @@ def validate(self) -> None:
f"config.framework({self.framework_str})!"
)

if self.input_ == "sampler" and self.off_policy_estimation_methods:
raise ValueError(
"Off-policy estimation methods can only be used if the input is a "
"dataset. We currently do not support applying off_policy_esitmation "
"method on a sampler input."
)

if self.input_ == "dataset":
# if we need to read a ray dataset set the parallelism and
# num_cpus_per_read_task from rollout worker settings
self.input_config["num_cpus_per_read_task"] = self.num_cpus_per_worker
if self.in_evaluation:
# If using dataset for evaluation, the parallelism gets set to
# evaluation_num_workers for backward compatibility and num_cpus gets
# set to num_cpus_per_worker from rollout worker. User only needs to
# set evaluation_num_workers.
self.input_config["parallelism"] = self.evaluation_num_workers or 1
Member:
OK, so we should simply use "parallelism" in our code?

Contributor Author:
In code, yes. From the user's side it's evaluation_num_workers.

else:
# If using dataset for training, the parallelism and num_cpus gets set
# based on rollout worker parameters. This is for backwards
# compatibility for now. User only needs to set num_rollout_workers.
self.input_config["parallelism"] = self.num_rollout_workers or 1

def build(
self,
env: Optional[Union[str, EnvType]] = None,
@@ -1460,8 +1486,10 @@ def offline_data(
- A callable that takes an `IOContext` object as only arg and returns a
ray.rllib.offline.InputReader.
- A string key that indexes a callable with tune.registry.register_input
input_config: Arguments accessible from the IOContext for configuring custom
input.
input_config: Arguments that describe the settings for reading the input.
If the input is `sampler`, this will be environment configuration, e.g.
`env_name` and `env_config`, etc. See `EnvContext` for more info.
If the input is `dataset`, this will be e.g. `format`, `path`.
actions_in_input_normalized: True, if the actions in a given offline "input"
are already normalized (between -1.0 and 1.0). This is usually the case
when the offline file has been generated by another RLlib algorithm
@@ -1497,6 +1525,25 @@ def offline_data(
if input_ is not NotProvided:
self.input_ = input_
if input_config is not NotProvided:
if not isinstance(input_config, dict):
raise ValueError(
f"input_config must be a dict, got {type(input_config)}."
)
# TODO (Kourosh) Once we use a complete separation between rollout worker
# and input dataset reader we can remove this.
# For now, error out if the user attempts to set these parameters.
msg = "{} should not be set in the input_config. RLlib will use {} instead."
if input_config.get("num_cpus_per_read_task") is not None:
raise ValueError(
msg.format("num_cpus_per_read_task", "num_cpus_per_worker")
)
if input_config.get("parallelism") is not None:
if self.in_evaluation:
raise ValueError(
msg.format("parallelism", "evaluation_num_workers")
)
else:
raise ValueError(msg.format("parallelism", "num_rollout_workers"))
self.input_config = input_config
if actions_in_input_normalized is not NotProvided:
self.actions_in_input_normalized = actions_in_input_normalized
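The new offline_data() validation can be illustrated with a hypothetical misuse (algorithm, path, and keys are placeholders):

from ray.rllib.algorithms.bc import BCConfig

# "parallelism" (and "num_cpus_per_read_task") may no longer be set directly;
# RLlib now derives them from num_rollout_workers / evaluation_num_workers.
config = BCConfig().offline_data(
    input_="dataset",
    input_config={"format": "json", "paths": "/tmp/data.json", "parallelism": 8},
)
# ValueError: parallelism should not be set in the input_config.
# RLlib will use num_rollout_workers instead.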
5 changes: 0 additions & 5 deletions rllib/algorithms/bc/bc.py
@@ -56,11 +56,6 @@ def __init__(self, algo_class=None):
# __sphinx_doc_end__
# fmt: on

# TODO: Remove this when the off_policy_estimation_methods
# default config is removed from MARWIL
# No off-policy estimation.
self.off_policy_estimation_methods = {}

@override(MARWILConfig)
def validate(self) -> None:
super().validate()
6 changes: 0 additions & 6 deletions rllib/algorithms/ddpg/ddpg.py
@@ -270,12 +270,6 @@ def validate(self) -> None:
f"Try setting config.rollouts(rollout_fragment_length={self.n_step})."
)

if self.model["custom_model"]:
Contributor Author:
Apparently we never called this validate() method before when custom_model was provided.

raise ValueError(
"Try setting config.training(use_state_preprocessor=True) "
"since a custom model was specified."
)

if self.grad_clip is not None and self.grad_clip <= 0.0:
raise ValueError("`grad_clip` value must be > 0.0!")

20 changes: 4 additions & 16 deletions rllib/algorithms/marwil/marwil.py
@@ -9,13 +9,9 @@
multi_gpu_train_one_step,
train_one_step,
)
from ray.rllib.offline.estimators import ImportanceSampling, WeightedImportanceSampling
from ray.rllib.policy.policy import Policy
from ray.rllib.utils.annotations import override
from ray.rllib.utils.deprecation import (
Deprecated,
deprecation_warning,
)
from ray.rllib.utils.deprecation import Deprecated, deprecation_warning
from ray.rllib.utils.metrics import (
NUM_AGENT_STEPS_SAMPLED,
NUM_ENV_STEPS_SAMPLED,
@@ -100,13 +96,6 @@ def __init__(self, algo_class=None):
self.train_batch_size = 2000
# __sphinx_doc_end__
# fmt: on

# TODO: Delete this and change off_policy_estimation_methods to {}
# Also remove the same section from BC
self.off_policy_estimation_methods = {
"is": {"type": ImportanceSampling},
"wis": {"type": WeightedImportanceSampling},
}
Member:
Why did we hardcode this before??
Also, not seeing the same changes for bc.py?

Member:
ping

Contributor Author:
I have no idea :) This is probably from before Rohan started on this project.

self._set_off_policy_estimation_methods = False

@override(AlgorithmConfig)
@@ -169,7 +158,6 @@ def evaluation(
**kwargs,
) -> "MARWILConfig":
"""Sets the evaluation related configuration.

Returns:
This updated AlgorithmConfig object.
"""
@@ -190,9 +178,9 @@ def build(
) -> "Algorithm":
if not self._set_off_policy_estimation_methods:
deprecation_warning(
old="MARWIL currently uses off_policy_estimation_methods: "
f"{self.off_policy_estimation_methods} by default. This will"
"change to off_policy_estimation_methods: {} in a future release."
old="MARWIL used to have off_policy_estimation_methods "
"is and wis by default. This has"
"changed to off_policy_estimation_methods: \{\}."
"If you want to use an off-policy estimator, specify it in"
".evaluation(off_policy_estimation_methods=...)",
error=False,
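With the hard-coded MARWIL defaults removed, users who still want IS/WIS estimates now opt in explicitly, along these lines (illustrative):

from ray.rllib.algorithms.marwil import MARWILConfig
from ray.rllib.offline.estimators import (
    ImportanceSampling,
    WeightedImportanceSampling,
)

config = MARWILConfig().evaluation(
    off_policy_estimation_methods={
        "is": {"type": ImportanceSampling},
        "wis": {"type": WeightedImportanceSampling},
    },
)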
2 changes: 1 addition & 1 deletion rllib/algorithms/marwil/tests/test_marwil.py
@@ -24,7 +24,7 @@
class TestMARWIL(unittest.TestCase):
@classmethod
def setUpClass(cls):
ray.init(num_cpus=4)
ray.init()

@classmethod
def tearDownClass(cls):
2 changes: 1 addition & 1 deletion rllib/algorithms/tests/test_algorithm.py
@@ -20,7 +20,7 @@
class TestAlgorithm(unittest.TestCase):
@classmethod
def setUpClass(cls):
ray.init(num_cpus=6)
ray.init()

@classmethod
def tearDownClass(cls):