From b72dc5bee2a6a493216a429a09cba1d8c45bb83f Mon Sep 17 00:00:00 2001
From: Jun Gong
Date: Wed, 9 Feb 2022 01:58:58 -0800
Subject: [PATCH] [RLlib] Remove Bandits RecSim example for now, since it
 doesn't really work.

Revert "[RLlib] Add an env wrapper so RecSim works with our Bandits agent.
(#22028)"

This reverts commit 9c95b9a5fae5882b6b89cd80cbc2a55b4b50c405.
---
 rllib/BUILD                                     |  8 ---
 rllib/env/wrappers/recsim.py                    | 56 +----------------
 .../env/wrappers/tests/test_recsim_wrapper.py   |  5 --
 .../bandit/tune_lin_ucb_train_recsim_env.py     | 60 -------------------
 4 files changed, 2 insertions(+), 127 deletions(-)
 delete mode 100644 rllib/examples/bandit/tune_lin_ucb_train_recsim_env.py

diff --git a/rllib/BUILD b/rllib/BUILD
index ad35ddf0fcd3..61ad0c0a43a2 100644
--- a/rllib/BUILD
+++ b/rllib/BUILD
@@ -2840,14 +2840,6 @@ py_test(
     srcs = ["examples/bandit/tune_lin_ucb_train_recommendation.py"],
 )

-py_test(
-    name = "examples/bandit/tune_lin_ucb_train_recsim_env",
-    main = "examples/bandit/tune_lin_ucb_train_recsim_env.py",
-    tags = ["team:ml", "examples", ],
-    size = "small",
-    srcs = ["examples/bandit/tune_lin_ucb_train_recsim_env.py"],
-)
-
 # --------------------------------------------------------------------
 # examples/documentation directory
 #
diff --git a/rllib/env/wrappers/recsim.py b/rllib/env/wrappers/recsim.py
index 56ef299dafce..6b2c76574fad 100644
--- a/rllib/env/wrappers/recsim.py
+++ b/rllib/env/wrappers/recsim.py
@@ -64,49 +64,6 @@ def observation(self, obs):
         return new_obs


-class RecSimObservationBanditWrapper(gym.ObservationWrapper):
-    """Fix a RecSim environment's observation format.
-
-    RecSim's observations are keyed by document IDs and nested under a
-    "doc" key. Our Bandits agent expects the observations to be a flat
-    2D array under an "item" key.
-
-    This environment wrapper converts observations into the right format.
-    """
-
-    def __init__(self, env: gym.Env):
-        super().__init__(env)
-        obs_space = self.env.observation_space
-
-        num_items = len(obs_space["doc"])
-        embedding_dim = next(iter(obs_space["doc"].values())).shape[-1]
-        self.observation_space = Dict(
-            OrderedDict(
-                [
-                    ("user", obs_space["user"]),
-                    (
-                        "item",
-                        gym.spaces.Box(
-                            low=-np.ones((num_items, embedding_dim)),
-                            high=np.ones((num_items, embedding_dim)),
-                        ),
-                    ),
-                    ("response", obs_space["response"]),
-                ]
-            )
-        )
-        self._sampled_obs = self.observation_space.sample()
-
-    def observation(self, obs):
-        new_obs = OrderedDict()
-        new_obs["user"] = obs["user"]
-        # Stack the doc-keyed embeddings into one (num_items, embedding_dim)
-        # array under the "item" key.
-        new_obs["item"] = np.vstack(list(obs["doc"].values()))
-        new_obs["response"] = obs["response"]
-        new_obs = convert_element_to_space_type(new_obs, self._sampled_obs)
-        return new_obs
-
-
 class RecSimResetWrapper(gym.Wrapper):
     """Fix a RecSim environment's reset() and close() functions.
@@ -160,9 +117,7 @@ def action(self, action: int) -> List[int]:


 def recsim_gym_wrapper(
-    recsim_gym_env: gym.Env,
-    convert_to_discrete_action_space: bool = False,
-    wrap_for_bandits: bool = False,
+    recsim_gym_env: gym.Env, convert_to_discrete_action_space: bool = False
 ) -> gym.Env:
     """Makes sure a RecSim gym.Env can be handled by RLlib.
@@ -186,8 +141,6 @@ def recsim_gym_wrapper(
             such as RLlib's DQN. If None, `convert_to_discrete_action_space`
             may also be provided via the EnvContext (config) when creating
             an actual env instance.
-        wrap_for_bandits: Bool indicating whether this RecSim env should be
-            wrapped for use with our Bandits agent.

     Returns:
         An RLlib-ready gym.Env instance.
@@ -196,8 +149,6 @@ def recsim_gym_wrapper(
     env = RecSimObservationSpaceWrapper(env)
     if convert_to_discrete_action_space:
         env = MultiDiscreteToDiscreteActionWrapper(env)
-    if wrap_for_bandits:
-        env = RecSimObservationBanditWrapper(env)
     return env


@@ -235,7 +186,6 @@ def __init__(self, config: Optional[EnvContext] = None):
             "resample_documents": True,
             "seed": 0,
             "convert_to_discrete_action_space": False,
-            "wrap_for_bandits": False,
         }
         if config is None or isinstance(config, dict):
             config = EnvContext(config or default_config, worker_index=0)
@@ -260,9 +210,7 @@ def __init__(self, config: Optional[EnvContext] = None):
         # Fix observation space and - if necessary - convert to discrete
         # action space (from multi-discrete).
         env = recsim_gym_wrapper(
-            gym_env,
-            config["convert_to_discrete_action_space"],
-            config["wrap_for_bandits"],
+            gym_env, config["convert_to_discrete_action_space"]
         )
         # Call the super (Wrapper constructor) passing it the created env.
         super().__init__(env=env)
diff --git a/rllib/env/wrappers/tests/test_recsim_wrapper.py b/rllib/env/wrappers/tests/test_recsim_wrapper.py
index b34e10cbcca2..14a9bc395db4 100644
--- a/rllib/env/wrappers/tests/test_recsim_wrapper.py
+++ b/rllib/env/wrappers/tests/test_recsim_wrapper.py
@@ -28,11 +28,6 @@ def test_action_space_conversion(self):
         new_obs, _, _, _ = env.step(action)
         self.assertTrue(env.observation_space.contains(new_obs))

-    def test_bandits_observation_space_conversion(self):
-        env = InterestEvolutionRecSimEnv({"wrap_for_bandits": True})
-        # The "item" entry of the observation space is a Box space.
-        self.assertIsInstance(env.observation_space["item"], gym.spaces.Box)
-
     def test_double_action_space_conversion_raises_exception(self):
         env = InterestEvolutionRecSimEnv({"convert_to_discrete_action_space": True})
         with self.assertRaises(UnsupportedSpaceException):
diff --git a/rllib/examples/bandit/tune_lin_ucb_train_recsim_env.py b/rllib/examples/bandit/tune_lin_ucb_train_recsim_env.py
deleted file mode 100644
index 016f9c8f4563..000000000000
--- a/rllib/examples/bandit/tune_lin_ucb_train_recsim_env.py
+++ /dev/null
@@ -1,60 +0,0 @@
-"""Example of using LinUCB on a RecSim environment."""
-
-import time
-
-from matplotlib import pyplot as plt
-import pandas as pd
-
-import ray
-from ray import tune
-import ray.rllib.examples.env.recsim_recommender_system_envs  # noqa
-
-
-if __name__ == "__main__":
-    ray.init()
-
-    config = {
-        # "RecSim-v1" is a pre-registered RecSim env.
-        # Alternatively, you can do:
-        # `from ray.rllib.examples.env.recsim_recommender_system_envs import ...`
-        # - LongTermSatisfactionRecSimEnv
-        # - InterestExplorationRecSimEnv
-        # - InterestEvolutionRecSimEnv
-        # Then: "env": [the imported RecSim class]
-        "env": "RecSim-v1",
-        "env_config": {
-            "convert_to_discrete_action_space": True,
-            "wrap_for_bandits": True,
-        },
-    }
-
-    # Total env timesteps will be training_iterations (10) *
-    # timesteps_per_iteration (100 by default) = 1,000.
-    training_iterations = 10
-
-    print("Running training for %s iterations" % training_iterations)
-
-    start_time = time.time()
-    analysis = tune.run(
-        "BanditLinUCB",
-        config=config,
-        stop={"training_iteration": training_iterations},
-        num_samples=1,
-        checkpoint_at_end=False,
-    )
-
-    print("The trials took", time.time() - start_time, "seconds\n")
-
-    # Aggregate the episode reward means of all trials over training steps.
-    frame = pd.DataFrame()
-    for df in analysis.trial_dataframes.values():
-        frame = frame.append(df, ignore_index=True)
-    x = frame.groupby("agent_timesteps_total")["episode_reward_mean"].aggregate(
-        ["mean", "max", "min", "std"]
-    )
-
-    # Plot the mean episode reward over training, with a +/- 1 std band.
-    plt.plot(x["mean"])
-    plt.fill_between(
-        x.index, x["mean"] - x["std"], x["mean"] + x["std"], color="b", alpha=0.2
-    )
-    plt.title("Episode reward mean")
-    plt.xlabel("Training steps")
-    plt.show()
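
Note: after this revert, RecSim envs remain usable with RLlib's non-bandit
algorithms via the surviving `convert_to_discrete_action_space` option. Below
is a minimal usage sketch of that still-supported path, modeled on the
remaining test above; it assumes the RecSim dependencies are installed and
uses the old gym step API (4-tuple), as the tests in this patch do.

    from ray.rllib.examples.env.recsim_recommender_system_envs import (
        InterestEvolutionRecSimEnv,
    )

    # Only the surviving config option is passed here; "wrap_for_bandits"
    # is no longer accepted after this revert.
    env = InterestEvolutionRecSimEnv({"convert_to_discrete_action_space": True})

    obs = env.reset()
    # With the MultiDiscrete->Discrete conversion, actions are plain ints.
    action = env.action_space.sample()
    obs, reward, done, info = env.step(action)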