From b72dc5bee2a6a493216a429a09cba1d8c45bb83f Mon Sep 17 00:00:00 2001
From: Jun Gong
Date: Wed, 9 Feb 2022 01:58:58 -0800
Subject: [PATCH] [RLlib] Remove Bandits RecSim example for now, since it
 doesn't really work.

Revert "[RLlib] Add an env wrapper so RecSim works with our Bandits agent.
(#22028)"

This reverts commit 9c95b9a5fae5882b6b89cd80cbc2a55b4b50c405.
---
 rllib/BUILD                                     |  8 ---
 rllib/env/wrappers/recsim.py                    | 56 +----------------
 .../env/wrappers/tests/test_recsim_wrapper.py   |  5 --
 .../bandit/tune_lin_ucb_train_recsim_env.py     | 60 -------------------
 4 files changed, 2 insertions(+), 127 deletions(-)
 delete mode 100644 rllib/examples/bandit/tune_lin_ucb_train_recsim_env.py

diff --git a/rllib/BUILD b/rllib/BUILD
index ad35ddf0fcd3..61ad0c0a43a2 100644
--- a/rllib/BUILD
+++ b/rllib/BUILD
@@ -2840,14 +2840,6 @@ py_test(
     srcs = ["examples/bandit/tune_lin_ucb_train_recommendation.py"],
 )

-py_test(
-    name = "examples/bandit/tune_lin_ucb_train_recsim_env",
-    main = "examples/bandit/tune_lin_ucb_train_recsim_env.py",
-    tags = ["team:ml", "examples", ],
-    size = "small",
-    srcs = ["examples/bandit/tune_lin_ucb_train_recsim_env.py"],
-)
-
 # --------------------------------------------------------------------
 # examples/documentation directory
 #
diff --git a/rllib/env/wrappers/recsim.py b/rllib/env/wrappers/recsim.py
index 56ef299dafce..6b2c76574fad 100644
--- a/rllib/env/wrappers/recsim.py
+++ b/rllib/env/wrappers/recsim.py
@@ -64,49 +64,6 @@ def observation(self, obs):
         return new_obs


-class RecSimObservationBanditWrapper(gym.ObservationWrapper):
-    """Fix a RecSim environment's observation format.
-
-    RecSim's observations are keyed by document IDs and nested under a
-    "doc" key. Our Bandits agent expects the observations to be a flat
-    2D array under an "item" key.
-
-    This environment wrapper converts observations into the right format.
-    """
-
-    def __init__(self, env: gym.Env):
-        super().__init__(env)
-        obs_space = self.env.observation_space
-
-        num_items = len(obs_space["doc"])
-        embedding_dim = next(iter(obs_space["doc"].values())).shape[-1]
-        self.observation_space = Dict(
-            OrderedDict(
-                [
-                    ("user", obs_space["user"]),
-                    (
-                        "item",
-                        gym.spaces.Box(
-                            low=-np.ones((num_items, embedding_dim)),
-                            high=np.ones((num_items, embedding_dim)),
-                        ),
-                    ),
-                    ("response", obs_space["response"]),
-                ]
-            )
-        )
-        self._sampled_obs = self.observation_space.sample()
-
-    def observation(self, obs):
-        new_obs = OrderedDict()
-        new_obs["user"] = obs["user"]
-        # Stack the doc-keyed embeddings into one (num_items, embedding_dim)
-        # array under the "item" key.
-        new_obs["item"] = np.vstack(list(obs["doc"].values()))
-        new_obs["response"] = obs["response"]
-        new_obs = convert_element_to_space_type(new_obs, self._sampled_obs)
-        return new_obs
-
-
 class RecSimResetWrapper(gym.Wrapper):
     """Fix a RecSim environment's reset() and close() functions.
@@ -160,9 +117,7 @@ def action(self, action: int) -> List[int]:


 def recsim_gym_wrapper(
-    recsim_gym_env: gym.Env,
-    convert_to_discrete_action_space: bool = False,
-    wrap_for_bandits: bool = False,
+    recsim_gym_env: gym.Env, convert_to_discrete_action_space: bool = False
 ) -> gym.Env:
     """Makes sure a RecSim gym.Env can be handled by RLlib.
@@ -186,8 +141,6 @@ def recsim_gym_wrapper(
             such as RLlib's DQN. If None, `convert_to_discrete_action_space`
             may also be provided via the EnvContext (config) when creating
             an actual env instance.
-        wrap_for_bandits: Bool indicating whether this RecSim env should be
-            wrapped for use with our Bandits agent.

     Returns:
         An RLlib-ready gym.Env instance.
@@ -196,8 +149,6 @@ def recsim_gym_wrapper(
     env = RecSimObservationSpaceWrapper(env)
     if convert_to_discrete_action_space:
         env = MultiDiscreteToDiscreteActionWrapper(env)
-    if wrap_for_bandits:
-        env = RecSimObservationBanditWrapper(env)
     return env


@@ -235,7 +186,6 @@ def __init__(self, config: Optional[EnvContext] = None):
             "resample_documents": True,
             "seed": 0,
             "convert_to_discrete_action_space": False,
-            "wrap_for_bandits": False,
         }
         if config is None or isinstance(config, dict):
             config = EnvContext(config or default_config, worker_index=0)
@@ -260,9 +210,7 @@ def __init__(self, config: Optional[EnvContext] = None):
         # Fix observation space and - if necessary - convert to discrete
         # action space (from multi-discrete).
         env = recsim_gym_wrapper(
-            gym_env,
-            config["convert_to_discrete_action_space"],
-            config["wrap_for_bandits"],
+            gym_env, config["convert_to_discrete_action_space"]
         )
         # Call the super (Wrapper constructor) passing it the created env.
         super().__init__(env=env)
diff --git a/rllib/env/wrappers/tests/test_recsim_wrapper.py b/rllib/env/wrappers/tests/test_recsim_wrapper.py
index b34e10cbcca2..14a9bc395db4 100644
--- a/rllib/env/wrappers/tests/test_recsim_wrapper.py
+++ b/rllib/env/wrappers/tests/test_recsim_wrapper.py
@@ -28,11 +28,6 @@ def test_action_space_conversion(self):
         new_obs, _, _, _ = env.step(action)
         self.assertTrue(env.observation_space.contains(new_obs))

-    def test_bandits_observation_space_conversion(self):
-        env = InterestEvolutionRecSimEnv({"wrap_for_bandits": True})
-        # The "item" entry of the observation space is a Box space.
-        self.assertIsInstance(env.observation_space["item"], gym.spaces.Box)
-
     def test_double_action_space_conversion_raises_exception(self):
         env = InterestEvolutionRecSimEnv({"convert_to_discrete_action_space": True})
         with self.assertRaises(UnsupportedSpaceException):
diff --git a/rllib/examples/bandit/tune_lin_ucb_train_recsim_env.py b/rllib/examples/bandit/tune_lin_ucb_train_recsim_env.py
deleted file mode 100644
index 016f9c8f4563..000000000000
--- a/rllib/examples/bandit/tune_lin_ucb_train_recsim_env.py
+++ /dev/null
@@ -1,60 +0,0 @@
-"""Example of using LinUCB on a RecSim environment."""
-
-import time
-
-from matplotlib import pyplot as plt
-import pandas as pd
-
-import ray
-from ray import tune
-import ray.rllib.examples.env.recsim_recommender_system_envs  # noqa
-
-
-if __name__ == "__main__":
-    ray.init()
-
-    config = {
-        # "RecSim-v1" is a pre-registered RecSim env.
-        # Alternatively, you can do:
-        # `from ray.rllib.examples.env.recsim_recommender_system_envs import ...`
-        # - LongTermSatisfactionRecSimEnv
-        # - InterestExplorationRecSimEnv
-        # - InterestEvolutionRecSimEnv
-        # Then: "env": [the imported RecSim class]
-        "env": "RecSim-v1",
-        "env_config": {
-            "convert_to_discrete_action_space": True,
-            "wrap_for_bandits": True,
-        },
-    }
-
-    # Total env timesteps will be training_iterations (10) *
-    # timesteps_per_iteration (100 by default) = 1,000.
-    training_iterations = 10
-
-    print("Running training for %s iterations" % training_iterations)
-
-    start_time = time.time()
-    analysis = tune.run(
-        "BanditLinUCB",
-        config=config,
-        stop={"training_iteration": training_iterations},
-        num_samples=1,
-        checkpoint_at_end=False,
-    )
-
-    print("The trials took", time.time() - start_time, "seconds\n")
-
-    # Aggregate the episode reward means of all trials over training steps.
-    frame = pd.DataFrame()
-    for df in analysis.trial_dataframes.values():
-        frame = frame.append(df, ignore_index=True)
-    x = frame.groupby("agent_timesteps_total")["episode_reward_mean"].aggregate(
-        ["mean", "max", "min", "std"]
-    )
-
-    # Plot the mean episode reward over training, with a +/- 1 std band.
-    plt.plot(x["mean"])
-    plt.fill_between(
-        x.index, x["mean"] - x["std"], x["mean"] + x["std"], color="b", alpha=0.2
-    )
-    plt.title("Episode reward mean")
-    plt.xlabel("Training steps")
-    plt.show()
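
Note: after this revert, RecSim envs remain usable with RLlib's non-bandit
algorithms via the surviving `convert_to_discrete_action_space` option. Below
is a minimal usage sketch of that still-supported path, modeled on the
remaining test above; it assumes the RecSim dependencies are installed and
uses the old gym step API (4-tuple), as the tests in this patch do.

    from ray.rllib.examples.env.recsim_recommender_system_envs import (
        InterestEvolutionRecSimEnv,
    )

    # Only the surviving config option is passed here; "wrap_for_bandits"
    # is no longer accepted after this revert.
    env = InterestEvolutionRecSimEnv({"convert_to_discrete_action_space": True})

    obs = env.reset()
    # With the MultiDiscrete->Discrete conversion, actions are plain ints.
    action = env.action_space.sample()
    obs, reward, done, info = env.step(action)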