From d892f68e00bac59345cd3ac86c13536d182ad5c5 Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Fri, 10 Jun 2022 10:34:23 -0700 Subject: [PATCH 1/4] added crr to algorithm docs --- doc/source/rllib/rllib-algorithms.rst | 23 +++++++++++++++++++++++ rllib/algorithms/crr/crr.py | 5 +++-- 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/doc/source/rllib/rllib-algorithms.rst b/doc/source/rllib/rllib-algorithms.rst index d5d8babe14a1..6f259c119133 100644 --- a/doc/source/rllib/rllib-algorithms.rst +++ b/doc/source/rllib/rllib-algorithms.rst @@ -23,6 +23,7 @@ Algorithm Frameworks Discrete Actions Continuo `Bandits`_ (`TS`_ & `LinUCB`_) torch **Yes** `+parametric`_ No **Yes** No `BC`_ tf + torch **Yes** `+parametric`_ **Yes** **Yes** `+RNN`_ torch `CQL`_ tf + torch No **Yes** No tf + torch +`CRR`_ torch **Yes** `+parametric`_ **Yes** **Yes** torch `DDPG`_ tf + torch No **Yes** **Yes** torch `APEX-DDPG`_ tf + torch No **Yes** **Yes** torch `ES`_ tf + torch **Yes** **Yes** No No @@ -634,6 +635,28 @@ Tuned examples: `HalfCheetah Random `__ `[implementation] `__ + +CRR is another offline RL algorithm based on Q-learning that can learn from an offline experience replay. +The challenge in applying existing Q-learning algorithms to offline RL is the problem of overestimating Q-function, as well as lack of exploration beyond the observed data. +The later become increasingly important during bootstrapping the bellman equation, where the Q-function queried for next state does not have support in the observed data. +To mitigate these issues, CRR, implements a simple and yet powerful idea of value-filtered regression. +Basically the key idea is to use a learned critic to filter-out the non-promising transitions from the replay dataset. For more details, please refer to the paper. + +Tuned examples: `CartPole-v0 `__, `Pendulum-v1 `__ + +.. 
literalinclude:: ../../../rllib/algorithms/crr/crr.py + :language: python + :start-after: __sphinx_doc_begin__ + :end-before: __sphinx_doc_end__ + + Derivative-free ~~~~~~~~~~~~~~~ diff --git a/rllib/algorithms/crr/crr.py b/rllib/algorithms/crr/crr.py index bab79a2e509b..38e8d4f6b983 100644 --- a/rllib/algorithms/crr/crr.py +++ b/rllib/algorithms/crr/crr.py @@ -40,6 +40,8 @@ def __init__(self, trainer_class=None): self.n_action_sample = 4 self.twin_q = True self.target_update_grad_intervals = 100 + # __sphinx_doc_end__ + # fmt: on self.replay_buffer_config = { "type": "ReplayBuffer", "capacity": 50000, @@ -57,8 +59,7 @@ def __init__(self, trainer_class=None): self.critic_lr = 3e-4 self.actor_lr = 3e-4 self.tau = 5e-3 - # __sphinx_doc_end__ - # fmt: on + # overriding the trainer config default self.num_workers = 0 # offline RL does not need rollout workers From e0a3ad4002d8f551cc44703f14a982040f10506b Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Fri, 10 Jun 2022 10:34:52 -0700 Subject: [PATCH 2/4] lint --- rllib/algorithms/crr/crr.py | 1 - 1 file changed, 1 deletion(-) diff --git a/rllib/algorithms/crr/crr.py b/rllib/algorithms/crr/crr.py index 38e8d4f6b983..dfa2ca7c1fbb 100644 --- a/rllib/algorithms/crr/crr.py +++ b/rllib/algorithms/crr/crr.py @@ -60,7 +60,6 @@ def __init__(self, trainer_class=None): self.actor_lr = 3e-4 self.tau = 5e-3 - # overriding the trainer config default self.num_workers = 0 # offline RL does not need rollout workers From b8b5a4c88dd9e5439aa3f802a8ab87342ad4e390 Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Fri, 10 Jun 2022 13:45:53 -0700 Subject: [PATCH 3/4] wip --- rllib/README.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/rllib/README.rst b/rllib/README.rst index 4c38130135c0..d3fe50954dac 100644 --- a/rllib/README.rst +++ b/rllib/README.rst @@ -60,7 +60,8 @@ Offline RL: - `Behavior Cloning (BC; derived from MARWIL implementation) `__ - `Conservative Q-Learning (CQL) `__ -- `Importance Sampling and Weighted Importance Sampling (OPE) `__ +- `Critic Regularized Regression (CRR) `__ +- `Importance Sampling and Weighted Importance Sampling (OPE) `__ - `Monotonic Advantage Re-Weighted Imitation Learning (MARWIL) `__ Model-free On-policy RL (for Games): From c3f916392b6ec00f1068f94ad1b1d99ac67567a0 Mon Sep 17 00:00:00 2001 From: Sven Mika Date: Tue, 14 Jun 2022 12:45:09 +0200 Subject: [PATCH 4/4] Apply suggestions from code review --- doc/source/rllib/rllib-algorithms.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/source/rllib/rllib-algorithms.rst b/doc/source/rllib/rllib-algorithms.rst index 6f259c119133..d3b499527483 100644 --- a/doc/source/rllib/rllib-algorithms.rst +++ b/doc/source/rllib/rllib-algorithms.rst @@ -644,10 +644,10 @@ Critic Regularized Regression (CRR) `[paper] `__ `[implementation] `__ CRR is another offline RL algorithm based on Q-learning that can learn from an offline experience replay. -The challenge in applying existing Q-learning algorithms to offline RL is the problem of overestimating Q-function, as well as lack of exploration beyond the observed data. -The later become increasingly important during bootstrapping the bellman equation, where the Q-function queried for next state does not have support in the observed data. -To mitigate these issues, CRR, implements a simple and yet powerful idea of value-filtered regression. -Basically the key idea is to use a learned critic to filter-out the non-promising transitions from the replay dataset. 
For more details, please refer to the paper.
+The challenge in applying existing Q-learning algorithms to offline RL lies in the overestimation of the Q-function, as well as the lack of exploration beyond the observed data.
+The latter becomes increasingly important during bootstrapping in the Bellman equation, where the Q-function queried for the next state's Q-value(s) does not have support in the observed data.
+To mitigate these issues, CRR implements a simple and yet powerful idea of "value-filtered regression".
+The key idea is to use a learned critic to filter out the non-promising transitions from the replay dataset. For more details, please refer to the paper (see link above).

Tuned examples: `CartPole-v0 `__, `Pendulum-v1 `__
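
To make the "value-filtered regression" idea described above more concrete, the following is a minimal, PyTorch-style sketch of one possible CRR actor update. It is not RLlib's actual implementation; the ``policy`` and ``critic`` interfaces and the ``mode``, ``beta``, and ``n_action_sample`` arguments are illustrative assumptions based on the description above (``n_action_sample`` mirrors the config field of the same name in ``crr.py``)::

    # A sketch of CRR's "value-filtered regression" actor update, assuming a
    # Gaussian policy and a Q-network with the interfaces noted below.
    # Illustrative only; not RLlib's CRR implementation.
    import torch


    def crr_actor_loss(
        policy,             # assumed: policy(obs) -> torch.distributions.Distribution
        critic,             # assumed: critic(obs, act) -> Q-value tensor of shape [B]
        obs,                # batch of observations from the offline dataset, [B, obs_dim]
        actions,            # actions stored in the dataset, [B, act_dim]
        n_action_sample=4,  # number of policy actions used to estimate V(s)
        mode="binary",      # "binary" -> indicator filter, "exp" -> exponential weighting
        beta=1.0,           # temperature for the exponential weighting
    ):
        with torch.no_grad():
            # Q-value of the action actually taken in the dataset.
            q_data = critic(obs, actions)                           # [B]

            # Monte-Carlo estimate of V(s) = E_{a ~ pi}[Q(s, a)] using a few
            # actions sampled from the current policy.
            dist = policy(obs)
            sampled = dist.sample((n_action_sample,))               # [N, B, act_dim]
            q_pi = torch.stack(
                [critic(obs, sampled[i]) for i in range(n_action_sample)]
            )                                                       # [N, B]
            v_estimate = q_pi.mean(dim=0)                           # [B]

            # Advantage of the dataset action over the policy's value estimate.
            advantage = q_data - v_estimate                         # [B]

            if mode == "binary":
                # Keep only transitions whose action beats the policy's value estimate.
                weight = (advantage > 0.0).float()
            else:
                # Softer, exponential weighting (clamped for numerical stability).
                weight = torch.clamp(torch.exp(advantage / beta), max=20.0)

        # Weighted behavior cloning: imitate only the "promising" dataset actions.
        log_prob = policy(obs).log_prob(actions)
        if log_prob.dim() > 1:
            # Sum per-dimension log-probs for factorized action distributions.
            log_prob = log_prob.sum(dim=-1)
        return -(weight * log_prob).mean()

In such a sketch the critic itself would be trained separately with a standard TD objective on the same offline data, e.g. with twin Q-networks and periodic target-network updates, as the ``twin_q`` and ``target_update_grad_intervals`` settings in the config diff above suggest.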