[RLlib] Eps greedy ope (#28837)
* Introduced a new abstraction, OfflineEvaluator, as the parent class of the OPE and feature-importance evaluators.
* Introduced estimate_multi_step vs. estimate_single_step.
* Algorithm OPE evaluation is now able to skip split_by_episode.
* Added epsilon-greedy exploration to the OPE methods.
* Fixed DM and DR variance issues.
* Cleaned up the inheritance, fixed unit tests and typos, and addressed lint and review nits.

Signed-off-by: Kourosh Hakhamaneshi <[email protected]>
kouroshHakha authored Sep 29, 2022
1 parent 6dbc116 commit c1e0d39
Showing 6 changed files with 62 additions and 38 deletions.
16 changes: 10 additions & 6 deletions rllib/offline/estimators/direct_method.py
@@ -34,22 +34,26 @@ def __init__(
self,
policy: Policy,
gamma: float,
epsilon_greedy: float = 0.0,
q_model_config: Optional[Dict] = None,
):
"""Initializes a Direct Method OPE Estimator.
Args:
policy: Policy to evaluate.
gamma: Discount factor of the environment.
epsilon_greedy: The probability with which we act according to a fully random
policy during deployment. With probability 1 - epsilon_greedy we act
according to the target policy.
q_model_config: Arguments to specify the Q-model. Must specify
a `type` key pointing to the Q-model class.
This Q-model is trained in the train() method and is used
to compute the state-value estimates for the DirectMethod estimator.
It must implement `train` and `estimate_v`.
TODO (Rohan138): Unify this with RLModule API.
a `type` key pointing to the Q-model class.
This Q-model is trained in the train() method and is used
to compute the state-value estimates for the DirectMethod estimator.
It must implement `train` and `estimate_v`.
TODO (Rohan138): Unify this with RLModule API.
"""

super().__init__(policy, gamma)
super().__init__(policy, gamma, epsilon_greedy)

q_model_config = q_model_config or {}
model_cls = q_model_config.pop("type", FQETorchModel)
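
As a rough, runnable illustration (not part of the diff): the DirectMethod estimator's target-policy value comes purely from the trained Q-model's state-value predictions on the logged data, with no importance weighting. The numbers below are made up.

import numpy as np

# Illustrative only: pretend these are FQE estimate_v(...) outputs on logged states.
v_values = np.array([0.7, 0.4, 0.9])
v_target = v_values.mean()  # DirectMethod-style estimate of the target policy's value
print(v_target)  # 0.666...
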
25 changes: 13 additions & 12 deletions rllib/offline/estimators/doubly_robust.py
@@ -7,7 +7,6 @@
from ray.rllib.utils.annotations import DeveloperAPI, override
from ray.rllib.utils.typing import SampleBatchType
from ray.rllib.utils.numpy import convert_to_numpy
from ray.rllib.utils.policy import compute_log_likelihoods_from_input_dict

from ray.rllib.offline.estimators.off_policy_estimator import OffPolicyEstimator
from ray.rllib.offline.estimators.fqe_torch_model import FQETorchModel
@@ -46,23 +45,27 @@ def __init__(
self,
policy: Policy,
gamma: float,
epsilon_greedy: float = 0.0,
q_model_config: Optional[Dict] = None,
):
"""Initializes a Doubly Robust OPE Estimator.
Args:
policy: Policy to evaluate.
gamma: Discount factor of the environment.
epsilon_greedy: The probability with which we act according to a fully random
policy during deployment. With probability 1 - epsilon_greedy we act
according to the target policy.
q_model_config: Arguments to specify the Q-model. Must specify
a `type` key pointing to the Q-model class.
This Q-model is trained in the train() method and is used
to compute the state-value and Q-value estimates
for the DoublyRobust estimator.
It must implement `train`, `estimate_q`, and `estimate_v`.
TODO (Rohan138): Unify this with RLModule API.
a `type` key pointing to the Q-model class.
This Q-model is trained in the train() method and is used
to compute the state-value and Q-value estimates
for the DoublyRobust estimator.
It must implement `train`, `estimate_q`, and `estimate_v`.
TODO (Rohan138): Unify this with RLModule API.
"""

super().__init__(policy, gamma)
super().__init__(policy, gamma, epsilon_greedy)
q_model_config = q_model_config or {}
model_cls = q_model_config.pop("type", FQETorchModel)

@@ -83,8 +86,7 @@ def estimate_on_single_episode(self, episode: SampleBatch) -> Dict[str, Any]:
estimates_per_epsiode = {}

rewards, old_prob = episode["rewards"], episode["action_prob"]
log_likelihoods = compute_log_likelihoods_from_input_dict(self.policy, episode)
new_prob = np.exp(convert_to_numpy(log_likelihoods))
new_prob = self.compute_action_probs(episode)

v_behavior = 0.0
v_target = 0.0
@@ -113,8 +115,7 @@ def estimate_on_single_step_samples(
estimates_per_epsiode = {}

rewards, old_prob = batch["rewards"], batch["action_prob"]
log_likelihoods = compute_log_likelihoods_from_input_dict(self.policy, batch)
new_prob = np.exp(convert_to_numpy(log_likelihoods))
new_prob = self.compute_action_probs(batch)

q_values = self.model.estimate_q(batch)
q_values = convert_to_numpy(q_values)
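
For orientation (not part of the diff), here is a toy, runnable sketch of the standard single-step doubly-robust combination these hunks feed into: the model-based value estimate plus an importance-weighted correction of the Q-model's error. All numbers are made up, and new_prob stands in for the epsilon-mixed probabilities returned by compute_action_probs.

import numpy as np

rewards = np.array([1.0, 0.0])
old_prob = np.array([0.5, 0.5])    # behavior policy's logged action probabilities
new_prob = np.array([0.9, 0.2])    # target policy's (possibly epsilon-mixed) probabilities
q_values = np.array([0.8, 0.3])    # Q-model estimates for the logged actions
v_values = np.array([0.7, 0.4])    # Q-model state-value estimates

weights = new_prob / old_prob
v_target = v_values + weights * (rewards - q_values)
print(v_target.mean())  # (1.06 + 0.28) / 2 = 0.67
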
9 changes: 2 additions & 7 deletions rllib/offline/estimators/importance_sampling.py
@@ -1,10 +1,7 @@
from ray.rllib.offline.estimators.off_policy_estimator import OffPolicyEstimator
from ray.rllib.utils.annotations import override, DeveloperAPI
from ray.rllib.policy.sample_batch import SampleBatch
from ray.rllib.utils.numpy import convert_to_numpy
from ray.rllib.utils.policy import compute_log_likelihoods_from_input_dict
from typing import Dict, List
import numpy as np


@DeveloperAPI
@@ -28,8 +25,7 @@ def estimate_on_single_episode(self, episode: SampleBatch) -> Dict[str, float]:
estimates_per_epsiode = {}

rewards, old_prob = episode["rewards"], episode["action_prob"]
log_likelihoods = compute_log_likelihoods_from_input_dict(self.policy, episode)
new_prob = np.exp(convert_to_numpy(log_likelihoods))
new_prob = self.compute_action_probs(episode)

# calculate importance ratios
p = []
@@ -59,8 +55,7 @@ def estimate_on_single_step_samples(
estimates_per_epsiode = {}

rewards, old_prob = batch["rewards"], batch["action_prob"]
log_likelihoods = compute_log_likelihoods_from_input_dict(self.policy, batch)
new_prob = np.exp(convert_to_numpy(log_likelihoods))
new_prob = self.compute_action_probs(batch)

weights = new_prob / old_prob
v_behavior = rewards
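
A toy, runnable sketch (not part of the diff) of the single-step importance-sampling estimate shown above: the behavior value is the average logged reward, while the target value re-weights each reward by new_prob / old_prob. Numbers are illustrative.

import numpy as np

rewards = np.array([1.0, 0.0, 1.0])
old_prob = np.array([0.5, 0.5, 0.25])  # behavior policy's action probabilities
new_prob = np.array([0.8, 0.1, 0.9])   # target policy's (possibly epsilon-mixed) probabilities

weights = new_prob / old_prob          # importance ratios: [1.6, 0.2, 3.6]
v_behavior = rewards.mean()            # 0.667
v_target = (weights * rewards).mean()  # (1.6 + 0.0 + 3.6) / 3 = 1.733
print(v_behavior, v_target)
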
30 changes: 29 additions & 1 deletion rllib/offline/estimators/off_policy_estimator.py
@@ -1,3 +1,4 @@
import gym
import numpy as np
import tree
from typing import Dict, Any, List
@@ -12,6 +13,7 @@
from ray.rllib.utils.policy import compute_log_likelihoods_from_input_dict
from ray.rllib.utils.annotations import (
DeveloperAPI,
ExperimentalAPI,
OverrideToImplementCustomLogic,
)
from ray.rllib.utils.deprecation import Deprecated
@@ -27,15 +29,25 @@ class OffPolicyEstimator(OfflineEvaluator):
"""Interface for an off policy estimator for counterfactual evaluation."""

@DeveloperAPI
def __init__(self, policy: Policy, gamma: float = 0.0):
def __init__(
self,
policy: Policy,
gamma: float = 0.0,
epsilon_greedy: float = 0.0,
):
"""Initializes an OffPolicyEstimator instance.
Args:
policy: Policy to evaluate.
gamma: Discount factor of the environment.
epsilon_greedy: The probability with which we act according to a fully random
policy during deployment. With probability 1 - epsilon_greedy we act according
to the target policy.
# TODO (kourosh): convert the input parameters to a config dict.
"""
self.policy = policy
self.gamma = gamma
self.epsilon_greedy = epsilon_greedy

@DeveloperAPI
def estimate_on_single_episode(self, episode: SampleBatch) -> Dict[str, Any]:
@@ -228,6 +240,22 @@ def check_action_prob_in_batch(self, batch: SampleBatchType) -> None:
"`off_policy_estimation_methods: {}` to disable estimation."
)

@ExperimentalAPI
def compute_action_probs(self, batch: SampleBatch):
log_likelihoods = compute_log_likelihoods_from_input_dict(self.policy, batch)
new_prob = np.exp(convert_to_numpy(log_likelihoods))

if self.epsilon_greedy > 0.0:
if not isinstance(self.policy.action_space, gym.spaces.Discrete):
raise ValueError(
"Evaluation with epsilon-greedy exploration is only supported "
"with discrete action spaces."
)
eps = self.epsilon_greedy
new_prob = new_prob * (1 - eps) + eps / self.policy.action_space.n

return new_prob

@DeveloperAPI
def train(self, batch: SampleBatchType) -> Dict[str, Any]:
"""Train a model for Off-Policy Estimation.
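
The epsilon-greedy adjustment in compute_action_probs boils down to blending the target policy's action probabilities with a uniform distribution over the discrete action space. A standalone sketch of that mixing (not part of the diff, assuming plain NumPy arrays rather than a SampleBatch):

import numpy as np

def mix_with_uniform(target_probs: np.ndarray, eps: float, n_actions: int) -> np.ndarray:
    # With probability eps the deployed policy acts uniformly at random,
    # with probability 1 - eps it follows the target policy.
    return target_probs * (1.0 - eps) + eps / n_actions

# Target policy puts 0.9, 0.6, and 0.1 on the logged actions in a 4-action space.
print(mix_with_uniform(np.array([0.9, 0.6, 0.1]), eps=0.1, n_actions=4))
# -> [0.835 0.565 0.115]
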
8 changes: 4 additions & 4 deletions rllib/offline/estimators/tests/test_ope.py
@@ -64,10 +64,10 @@ def setUpClass(cls):
evaluation_num_workers=1,
evaluation_duration_unit="episodes",
off_policy_estimation_methods={
"is": {"type": ImportanceSampling},
"wis": {"type": WeightedImportanceSampling},
"dm_fqe": {"type": DirectMethod},
"dr_fqe": {"type": DoublyRobust},
"is": {"type": ImportanceSampling, "epsilon_greedy": 0.1},
"wis": {"type": WeightedImportanceSampling, "epsilon_greedy": 0.1},
"dm_fqe": {"type": DirectMethod, "epsilon_greedy": 0.1},
"dr_fqe": {"type": DoublyRobust, "epsilon_greedy": 0.1},
},
)
)
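
Outside the test, the same epsilon_greedy key can be passed from a user config. A minimal sketch (not part of the diff), assuming Ray 2.x's AlgorithmConfig API, CartPole as a stand-in environment, and a placeholder offline-data path:

from ray.rllib.algorithms.dqn import DQNConfig
from ray.rllib.offline.estimators import DoublyRobust, ImportanceSampling

config = (
    DQNConfig()
    .environment("CartPole-v1")
    .offline_data(input_="/tmp/cartpole-out")  # placeholder path to logged JSON data
    .evaluation(
        evaluation_interval=1,
        evaluation_duration=10,
        evaluation_duration_unit="episodes",
        off_policy_estimation_methods={
            "is": {"type": ImportanceSampling, "epsilon_greedy": 0.1},
            "dr_fqe": {"type": DoublyRobust, "epsilon_greedy": 0.1},
        },
    )
)
algo = config.build()
print(algo.evaluate())
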
12 changes: 4 additions & 8 deletions rllib/offline/estimators/weighted_importance_sampling.py
@@ -3,10 +3,8 @@

from ray.rllib.offline.estimators.off_policy_estimator import OffPolicyEstimator
from ray.rllib.policy.sample_batch import SampleBatch
from ray.rllib.utils.policy import compute_log_likelihoods_from_input_dict
from ray.rllib.policy import Policy
from ray.rllib.utils.annotations import override, DeveloperAPI
from ray.rllib.utils.numpy import convert_to_numpy


@DeveloperAPI
@@ -29,8 +27,8 @@ class WeightedImportanceSampling(OffPolicyEstimator):
For more information refer to https://arxiv.org/pdf/1911.06854.pdf"""

@override(OffPolicyEstimator)
def __init__(self, policy: Policy, gamma: float):
super().__init__(policy, gamma)
def __init__(self, policy: Policy, gamma: float, epsilon_greedy: float = 0.0):
super().__init__(policy, gamma, epsilon_greedy)
# map from time to cummulative propensity values
self.cummulative_ips_values = []
# map from time to number of episodes that reached this time
@@ -70,8 +68,7 @@ def estimate_on_single_step_samples(
) -> Dict[str, List[float]]:
estimates_per_epsiode = {}
rewards, old_prob = batch["rewards"], batch["action_prob"]
log_likelihoods = compute_log_likelihoods_from_input_dict(self.policy, batch)
new_prob = np.exp(convert_to_numpy(log_likelihoods))
new_prob = self.compute_action_probs(batch)

weights = new_prob / old_prob
v_behavior = rewards
@@ -95,8 +92,7 @@ def on_before_split_batch_by_episode(
@override(OffPolicyEstimator)
def peek_on_single_episode(self, episode: SampleBatch) -> None:
old_prob = episode["action_prob"]
log_likelihoods = compute_log_likelihoods_from_input_dict(self.policy, episode)
new_prob = np.exp(convert_to_numpy(log_likelihoods))
new_prob = self.compute_action_probs(episode)

# calculate importance ratios
episode_p = []
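
For comparison with the plain IS sketch earlier (not part of the diff): weighted importance sampling normalizes the importance weights by their own sum, trading a small bias for much lower variance. A toy single-step version with made-up numbers:

import numpy as np

rewards = np.array([1.0, 0.0, 1.0])
weights = np.array([1.6, 0.2, 3.6])  # new_prob / old_prob, as in the IS example

v_target_is = (weights * rewards).mean()                   # 1.733 (plain IS)
v_target_wis = (weights * rewards).sum() / weights.sum()   # 5.2 / 5.4 ≈ 0.963
print(v_target_is, v_target_wis)
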
