From e38a82595efe22cfae8f0f36a34e3e3c87c4805c Mon Sep 17 00:00:00 2001 From: Ishant Mrinal Haloi Date: Sat, 18 Dec 2021 18:46:49 +0530 Subject: [PATCH 1/8] add re3 documentation --- doc/source/rllib-algorithms.rst | 61 +++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/doc/source/rllib-algorithms.rst b/doc/source/rllib-algorithms.rst index 70d12bbaf9fe..660003e0f1c8 100644 --- a/doc/source/rllib-algorithms.rst +++ b/doc/source/rllib-algorithms.rst @@ -887,3 +887,64 @@ Intrinsic rewards for each env-step are calculated by taking the euclidian dista ("forward" model). This allows the agent to explore areas of the environment, where the "forward" model still performs poorly (are not "understood" yet), whereas exploration to these areas will taper down after the agent has visited them often: The "forward" model will eventually get better at predicting these next latent vectors, which in turn will diminish the intrinsic rewards (decrease the euclidian distance between predicted and actual vectors). + + + +.. _RE3: + +RE3 (Random Encoders for Efficient Exploration) +----------------------------------------------- + +|Tensorflow| +`[paper] `__ +`[implementation] `__ + +Tuned examples: +`LunarLanderContinuous-v2 `__ (use ``--env LunarLanderContinuous-v2`` command line option) +`Test case with Pendulum-v1 example `__ + +**Activating RE3** +The RE3 plugin can be easily activated by specifying it as the Exploration class to-be-used +in the main Trainer config and inheriting the `RE3UpdateCallbacks` as shown in this `example `__. Most of its parameters usually do not have to be specified as the module uses the values from the paper by default. For example: + +.. code-block:: python + + config = sac.DEFAULT_CONFIG.copy() + config["env"] = "Pendulum-v1" + config["seed"] = 12345 + config["callbacks"] = RE3Callbacks + config["exploration_config"] = { + "type": "RE3", + # the dimensionality of the observation embedding vectors in latent space. + "embeds_dim": 128, + "rho": 0.1, # Beta decay factor, used for on-policy algorithm. + "k_nn": 50, # Number of neighbours to set for K-NN entropy estimation. + # Configuration for the encoder network, producing embedding vectors from observations. + # This can be used to configure fcnet- or conv_net setups to properly process any + # observation space. By default uses the Policy model configuration. + "encoder_net_config": { + "fcnet_hiddens": [], + "fcnet_activation": "relu", + }, + "beta": 0.2 # Hyperparameter to choose between exploration and exploitation. + # Schedule to use for beta decay, one of constant" or "linear_decay". + "beta_schedule": 'constant', + # Specify, which exploration sub-type to use (usually, the algo's "default" + # exploration, e.g. EpsilonGreedy for DQN, StochasticSampling for PG/SAC). + "sub_exploration": { + "type": "StochasticSampling", + } + } + + +**Functionality** +RLlib's RE3 is based on `"Random Encoders for Efficient Exploration" described in this paper here `__. +RE3 quantifies exploration based on state entropy. The entropy of a state is calculated based on its distance from K nearest neighbors states present in the replay buffer (we use train batch in this implementation) in the latent space. +The entropy of the state is considered as intrinsic rewards and added to the extrinsic rewards for policy optimization when external rewards are available, else used as "intrinsic rewards" for unsupervised pre-training of RL agent. 
RE3 allows agents to learn in sparse-reward or even no-reward environments by +considering the state entropy as "intrinsic rewards". + +This exploration objective can be used with both model-free and model-based RL algorithms. +RE3 uses a randomly initialized encoder to get the state’s latent representation, thus taking away the complexity of training the representation learning method. The encoder weights are fixed during the entire duration of the training process. + + + From 0e688a080fc0441189bb163308d7606333b99e38 Mon Sep 17 00:00:00 2001 From: Ishant Mrinal Haloi Date: Sat, 18 Dec 2021 18:49:03 +0530 Subject: [PATCH 2/8] remove empty lines --- doc/source/rllib-algorithms.rst | 3 --- 1 file changed, 3 deletions(-) diff --git a/doc/source/rllib-algorithms.rst b/doc/source/rllib-algorithms.rst index 660003e0f1c8..63e3c1ec1944 100644 --- a/doc/source/rllib-algorithms.rst +++ b/doc/source/rllib-algorithms.rst @@ -945,6 +945,3 @@ considering the state entropy as "intrinsic rewards". This exploration objective can be used with both model-free and model-based RL algorithms. RE3 uses a randomly initialized encoder to get the state’s latent representation, thus taking away the complexity of training the representation learning method. The encoder weights are fixed during the entire duration of the training process. - - - From 0170b616710b9534ae5fe00abeda843349fe48d7 Mon Sep 17 00:00:00 2001 From: Ishant Mrinal <33053278+n30111@users.noreply.github.com> Date: Mon, 20 Dec 2021 15:55:48 +0530 Subject: [PATCH 3/8] Apply suggestions from code review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Jeroen Bédorf --- doc/source/rllib-algorithms.rst | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/doc/source/rllib-algorithms.rst b/doc/source/rllib-algorithms.rst index 63e3c1ec1944..82afcbac440d 100644 --- a/doc/source/rllib-algorithms.rst +++ b/doc/source/rllib-algorithms.rst @@ -927,7 +927,7 @@ in the main Trainer config and inheriting the `RE3UpdateCallbacks` as shown in t "fcnet_activation": "relu", }, "beta": 0.2 # Hyperparameter to choose between exploration and exploitation. - # Schedule to use for beta decay, one of constant" or "linear_decay". + # Schedule to use for beta decay, one of "constant" or "linear_decay". "beta_schedule": 'constant', # Specify, which exploration sub-type to use (usually, the algo's "default" # exploration, e.g. EpsilonGreedy for DQN, StochasticSampling for PG/SAC). @@ -939,9 +939,10 @@ in the main Trainer config and inheriting the `RE3UpdateCallbacks` as shown in t **Functionality** RLlib's RE3 is based on `"Random Encoders for Efficient Exploration" described in this paper here `__. -RE3 quantifies exploration based on state entropy. The entropy of a state is calculated based on its distance from K nearest neighbors states present in the replay buffer (we use train batch in this implementation) in the latent space. -The entropy of the state is considered as intrinsic rewards and added to the extrinsic rewards for policy optimization when external rewards are available, else used as "intrinsic rewards" for unsupervised pre-training of RL agent. RE3 allows agents to learn in sparse-reward or even no-reward environments by -considering the state entropy as "intrinsic rewards". +RE3 quantifies exploration based on state entropy. 
The entropy of a state is calculated based on its distance from K nearest neighbors states present in the replay buffer in the latent space (train batch is used in this implementation). +The state entropy is considered as an intrinsic reward and for policy optimization added to the extrinsic reward when available. If the extrinsic reward is not available then the state entropy is used as "intrinsic reward" for unsupervised pre-training of the RL agent. +RE3 further allows agents to learn in sparse-reward or even no-reward environments by +using the state entropy as "intrinsic rewards". This exploration objective can be used with both model-free and model-based RL algorithms. RE3 uses a randomly initialized encoder to get the state’s latent representation, thus taking away the complexity of training the representation learning method. The encoder weights are fixed during the entire duration of the training process. From 52189881e03373ab245cb8bf0eabc2809100c0f7 Mon Sep 17 00:00:00 2001 From: Ishant Mrinal Haloi Date: Mon, 20 Dec 2021 18:29:05 +0530 Subject: [PATCH 4/8] update doc string --- doc/source/rllib-algorithms.rst | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/doc/source/rllib-algorithms.rst b/doc/source/rllib-algorithms.rst index 82afcbac440d..e21f6574d1ff 100644 --- a/doc/source/rllib-algorithms.rst +++ b/doc/source/rllib-algorithms.rst @@ -926,8 +926,11 @@ in the main Trainer config and inheriting the `RE3UpdateCallbacks` as shown in t "fcnet_hiddens": [], "fcnet_activation": "relu", }, - "beta": 0.2 # Hyperparameter to choose between exploration and exploitation. - # Schedule to use for beta decay, one of "constant" or "linear_decay". + # Hyperparameter to choose between exploration and exploitation. A higher value of beta add + # more importance to the intrinsic reward, as per the following equation + # `reward = r + beta * intrinsic_reward` + "beta": 0.2, + # Schedule to use for beta decay, one of constant" or "linear_decay". "beta_schedule": 'constant', # Specify, which exploration sub-type to use (usually, the algo's "default" # exploration, e.g. EpsilonGreedy for DQN, StochasticSampling for PG/SAC). From 530f60003e1b9b1b76ea3d4c81f68b4815044392 Mon Sep 17 00:00:00 2001 From: Ishant Mrinal <33053278+n30111@users.noreply.github.com> Date: Mon, 20 Dec 2021 18:50:11 +0530 Subject: [PATCH 5/8] Apply suggestions from code review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Jeroen Bédorf --- doc/source/rllib-algorithms.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/rllib-algorithms.rst b/doc/source/rllib-algorithms.rst index e21f6574d1ff..0bfe83a20fb4 100644 --- a/doc/source/rllib-algorithms.rst +++ b/doc/source/rllib-algorithms.rst @@ -917,7 +917,7 @@ in the main Trainer config and inheriting the `RE3UpdateCallbacks` as shown in t "type": "RE3", # the dimensionality of the observation embedding vectors in latent space. "embeds_dim": 128, - "rho": 0.1, # Beta decay factor, used for on-policy algorithm. + "rho": 0.1, # Beta decay factor, used for on-policy algorithm. "k_nn": 50, # Number of neighbours to set for K-NN entropy estimation. # Configuration for the encoder network, producing embedding vectors from observations. 
# This can be used to configure fcnet- or conv_net setups to properly process any @@ -926,7 +926,7 @@ in the main Trainer config and inheriting the `RE3UpdateCallbacks` as shown in t "fcnet_hiddens": [], "fcnet_activation": "relu", }, - # Hyperparameter to choose between exploration and exploitation. A higher value of beta add + # Hyperparameter to choose between exploration and exploitation. A higher value of beta adds # more importance to the intrinsic reward, as per the following equation # `reward = r + beta * intrinsic_reward` "beta": 0.2, From 4f689b39285ade19cacb7811067d8ca828523cb9 Mon Sep 17 00:00:00 2001 From: Ishant Mrinal Haloi Date: Fri, 24 Dec 2021 12:20:47 +0530 Subject: [PATCH 6/8] address pr comments --- doc/source/rllib-algorithms.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/rllib-algorithms.rst b/doc/source/rllib-algorithms.rst index 0bfe83a20fb4..71a6b2d8759d 100644 --- a/doc/source/rllib-algorithms.rst +++ b/doc/source/rllib-algorithms.rst @@ -899,7 +899,7 @@ RE3 (Random Encoders for Efficient Exploration) `[paper] `__ `[implementation] `__ -Tuned examples: +Examples: `LunarLanderContinuous-v2 `__ (use ``--env LunarLanderContinuous-v2`` command line option) `Test case with Pendulum-v1 example `__ @@ -942,7 +942,7 @@ in the main Trainer config and inheriting the `RE3UpdateCallbacks` as shown in t **Functionality** RLlib's RE3 is based on `"Random Encoders for Efficient Exploration" described in this paper here `__. -RE3 quantifies exploration based on state entropy. The entropy of a state is calculated based on its distance from K nearest neighbors states present in the replay buffer in the latent space (train batch is used in this implementation). +RE3 quantifies exploration based on state entropy. The entropy of a state is calculated based on its distance from K nearest neighbor states present in the replay buffer in the latent space (With this implementation, KNN is implemented using training samples from the same batch). The state entropy is considered as an intrinsic reward and for policy optimization added to the extrinsic reward when available. If the extrinsic reward is not available then the state entropy is used as "intrinsic reward" for unsupervised pre-training of the RL agent. RE3 further allows agents to learn in sparse-reward or even no-reward environments by using the state entropy as "intrinsic rewards". 
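The exploration scheme documented in the patches above reduces to a short computation: a fixed, randomly initialized encoder embeds observations into a latent space; the distance from each embedding to its k-th nearest neighbor within the batch serves as a per-state entropy estimate; and that estimate is added to the extrinsic reward as ``reward = r + beta * intrinsic_reward``. The NumPy snippet below is a minimal sketch of that idea, not RLlib's ``RE3`` exploration class; ``encoder_weights``, ``intrinsic_rewards``, and the ``log(1 + distance)`` entropy form (taken from the RE3 paper) are illustrative choices.

.. code-block:: python

    import numpy as np

    rng = np.random.default_rng(0)
    obs_dim, embeds_dim, k, beta = 8, 128, 3, 0.2

    # Random projection standing in for the fixed (never trained) encoder network.
    encoder_weights = rng.normal(size=(obs_dim, embeds_dim))

    def intrinsic_rewards(obs_batch: np.ndarray) -> np.ndarray:
        """log(1 + distance to the k-th nearest neighbor) in latent space, per observation."""
        embeds = obs_batch @ encoder_weights                   # [batch, embeds_dim]
        dists = np.linalg.norm(embeds[:, None, :] - embeds[None, :, :], axis=-1)
        knn_dist = np.sort(dists, axis=1)[:, k]                # index 0 is the self-distance
        return np.log(knn_dist + 1.0)

    obs = rng.normal(size=(32, obs_dim))        # stand-in for a train batch of observations
    extrinsic = rng.normal(size=32)             # stand-in for environment rewards
    total_reward = extrinsic + beta * intrinsic_rewards(obs)   # shape (32,)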
From ff99c84cf15129ce48d60c0d1d1974d21d8f31cf Mon Sep 17 00:00:00 2001 From: n3011 Date: Tue, 20 Dec 2022 03:54:15 +0000 Subject: [PATCH 7/8] fix to_tf issue for dict features Signed-off-by: n3011 --- python/ray/data/dataset.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/ray/data/dataset.py b/python/ray/data/dataset.py index 4442ba449eb5..ab6774a87476 100644 --- a/python/ray/data/dataset.py +++ b/python/ray/data/dataset.py @@ -3204,7 +3204,9 @@ def convert_batch_to_tensors( if isinstance(columns, str): return convert_ndarray_to_tf_tensor(batch[columns], type_spec=type_spec) return { - convert_ndarray_to_tf_tensor(batch[column], type_spec=type_spec[column]) + column: convert_ndarray_to_tf_tensor( + batch[column], type_spec=type_spec[column] + ) for column in columns } From bcc0f25aec5fec3e986a6a0b08cf835db00f7d3e Mon Sep 17 00:00:00 2001 From: n3011 Date: Tue, 20 Dec 2022 12:24:06 +0000 Subject: [PATCH 8/8] add test for to_tf multiple feature columns Signed-off-by: n3011 --- python/ray/data/tests/test_dataset_tf.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/python/ray/data/tests/test_dataset_tf.py b/python/ray/data/tests/test_dataset_tf.py index 6ff6a37015c1..235fdee19354 100644 --- a/python/ray/data/tests/test_dataset_tf.py +++ b/python/ray/data/tests/test_dataset_tf.py @@ -46,6 +46,27 @@ def test_element_spec_type_with_multiple_columns(self): for value in feature_output_signature.values() ) + df = pd.DataFrame( + {"feature1": [0, 1, 2], "feature2": [3, 4, 5], "label": [0, 1, 1]} + ) + ds = ray.data.from_pandas(df) + dataset = ds.to_tf( + feature_columns=["feature1", "feature2"], + label_columns="label", + batch_size=3, + ) + feature_output_signature, _ = dataset.element_spec + assert isinstance(feature_output_signature, dict) + assert feature_output_signature.keys() == {"feature1", "feature2"} + assert all( + isinstance(value, tf.TypeSpec) + for value in feature_output_signature.values() + ) + features, labels = next(iter(dataset)) + assert (labels.numpy() == df["label"].values).all() + assert (features["feature1"].numpy() == df["feature1"].values).all() + assert (features["feature2"].numpy() == df["feature2"].values).all() + def test_element_spec_name(self): ds = ray.data.from_items([{"spam": 0, "ham": 0}])
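The ``to_tf`` change in the last two patches comes down to adding the ``column:`` key, which turns an accidental set comprehension into the intended dict comprehension keyed by feature column. The snippet below isolates that difference; ``tuple`` stands in for ``convert_ndarray_to_tf_tensor`` only so the sketch runs without TensorFlow, and the variable names are made up for illustration.

.. code-block:: python

    batch = {"feature1": [0, 1, 2], "feature2": [3, 4, 5]}
    columns = ["feature1", "feature2"]

    # Before the fix: a set comprehension -- the column names are lost.
    broken = {tuple(batch[column]) for column in columns}
    # After the fix: a dict comprehension mapping each feature column to its converted values.
    fixed = {column: tuple(batch[column]) for column in columns}

    print(type(broken))  # <class 'set'>
    print(fixed)         # {'feature1': (0, 1, 2), 'feature2': (3, 4, 5)}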