From d892f68e00bac59345cd3ac86c13536d182ad5c5 Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Fri, 10 Jun 2022 10:34:23 -0700 Subject: [PATCH 1/4] added crr to algorithm docs --- doc/source/rllib/rllib-algorithms.rst | 23 +++++++++++++++++++++++ rllib/algorithms/crr/crr.py | 5 +++-- 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/doc/source/rllib/rllib-algorithms.rst b/doc/source/rllib/rllib-algorithms.rst index d5d8babe14a1..6f259c119133 100644 --- a/doc/source/rllib/rllib-algorithms.rst +++ b/doc/source/rllib/rllib-algorithms.rst @@ -23,6 +23,7 @@ Algorithm Frameworks Discrete Actions Continuo `Bandits`_ (`TS`_ & `LinUCB`_) torch **Yes** `+parametric`_ No **Yes** No `BC`_ tf + torch **Yes** `+parametric`_ **Yes** **Yes** `+RNN`_ torch `CQL`_ tf + torch No **Yes** No tf + torch +`CRR`_ torch **Yes** `+parametric`_ **Yes** **Yes** torch `DDPG`_ tf + torch No **Yes** **Yes** torch `APEX-DDPG`_ tf + torch No **Yes** **Yes** torch `ES`_ tf + torch **Yes** **Yes** No No @@ -634,6 +635,28 @@ Tuned examples: `HalfCheetah Random `__ `[implementation] `__ + +CRR is another offline RL algorithm based on Q-learning that can learn from an offline experience replay. +The challenge in applying existing Q-learning algorithms to offline RL is the problem of overestimating Q-function, as well as lack of exploration beyond the observed data. +The later become increasingly important during bootstrapping the bellman equation, where the Q-function queried for next state does not have support in the observed data. +To mitigate these issues, CRR, implements a simple and yet powerful idea of value-filtered regression. +Basically the key idea is to use a learned critic to filter-out the non-promising transitions from the replay dataset. For more details, please refer to the paper. + +Tuned examples: `CartPole-v0 `__, `Pendulum-v1 `__ + +.. 
literalinclude:: ../../../rllib/algorithms/crr/crr.py + :language: python + :start-after: __sphinx_doc_begin__ + :end-before: __sphinx_doc_end__ + + Derivative-free ~~~~~~~~~~~~~~~ diff --git a/rllib/algorithms/crr/crr.py b/rllib/algorithms/crr/crr.py index bab79a2e509b..38e8d4f6b983 100644 --- a/rllib/algorithms/crr/crr.py +++ b/rllib/algorithms/crr/crr.py @@ -40,6 +40,8 @@ def __init__(self, trainer_class=None): self.n_action_sample = 4 self.twin_q = True self.target_update_grad_intervals = 100 + # __sphinx_doc_end__ + # fmt: on self.replay_buffer_config = { "type": "ReplayBuffer", "capacity": 50000, @@ -57,8 +59,7 @@ def __init__(self, trainer_class=None): self.critic_lr = 3e-4 self.actor_lr = 3e-4 self.tau = 5e-3 - # __sphinx_doc_end__ - # fmt: on + # overriding the trainer config default self.num_workers = 0 # offline RL does not need rollout workers From e0a3ad4002d8f551cc44703f14a982040f10506b Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Fri, 10 Jun 2022 10:34:52 -0700 Subject: [PATCH 2/4] lint --- rllib/algorithms/crr/crr.py | 1 - 1 file changed, 1 deletion(-) diff --git a/rllib/algorithms/crr/crr.py b/rllib/algorithms/crr/crr.py index 38e8d4f6b983..dfa2ca7c1fbb 100644 --- a/rllib/algorithms/crr/crr.py +++ b/rllib/algorithms/crr/crr.py @@ -60,7 +60,6 @@ def __init__(self, trainer_class=None): self.actor_lr = 3e-4 self.tau = 5e-3 - # overriding the trainer config default self.num_workers = 0 # offline RL does not need rollout workers From b8b5a4c88dd9e5439aa3f802a8ab87342ad4e390 Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Fri, 10 Jun 2022 13:45:53 -0700 Subject: [PATCH 3/4] wip --- rllib/README.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/rllib/README.rst b/rllib/README.rst index 4c38130135c0..d3fe50954dac 100644 --- a/rllib/README.rst +++ b/rllib/README.rst @@ -60,7 +60,8 @@ Offline RL: - `Behavior Cloning (BC; derived from MARWIL implementation) `__ - `Conservative Q-Learning (CQL) `__ -- `Importance Sampling and Weighted Importance Sampling (OPE) `__ +- `Critic Regularized Regression (CRR) `__ +- `Importance Sampling and Weighted Importance Sampling (OPE) `__ - `Monotonic Advantage Re-Weighted Imitation Learning (MARWIL) `__ Model-free On-policy RL (for Games): From c3f916392b6ec00f1068f94ad1b1d99ac67567a0 Mon Sep 17 00:00:00 2001 From: Sven Mika Date: Tue, 14 Jun 2022 12:45:09 +0200 Subject: [PATCH 4/4] Apply suggestions from code review --- doc/source/rllib/rllib-algorithms.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/source/rllib/rllib-algorithms.rst b/doc/source/rllib/rllib-algorithms.rst index 6f259c119133..d3b499527483 100644 --- a/doc/source/rllib/rllib-algorithms.rst +++ b/doc/source/rllib/rllib-algorithms.rst @@ -644,10 +644,10 @@ Critic Regularized Regression (CRR) `[paper] `__ `[implementation] `__ CRR is another offline RL algorithm based on Q-learning that can learn from an offline experience replay. -The challenge in applying existing Q-learning algorithms to offline RL is the problem of overestimating Q-function, as well as lack of exploration beyond the observed data. -The later become increasingly important during bootstrapping the bellman equation, where the Q-function queried for next state does not have support in the observed data. -To mitigate these issues, CRR, implements a simple and yet powerful idea of value-filtered regression. -Basically the key idea is to use a learned critic to filter-out the non-promising transitions from the replay dataset. 
For more details, please refer to the paper.
+The challenge in applying existing Q-learning algorithms to offline RL lies in the overestimation of the Q-function, as well as the lack of exploration beyond the observed data.
+The latter becomes increasingly important during bootstrapping in the Bellman equation, where the Q-function queried for the next state's Q-value(s) does not have support in the observed data.
+To mitigate these issues, CRR implements a simple and yet powerful idea of "value-filtered regression".
+The key idea is to use a learned critic to filter out the non-promising transitions from the replay dataset. For more details, please refer to the paper (see link above).

Tuned examples: `CartPole-v0 `__, `Pendulum-v1 `__
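
To make the "value-filtered regression" idea described above more concrete, the following is a minimal, PyTorch-style sketch of one possible CRR actor update. It is not RLlib's actual implementation; the ``policy`` and ``critic`` interfaces and the ``mode``, ``beta``, and ``n_action_sample`` arguments are illustrative assumptions based on the description above (``n_action_sample`` mirrors the config field of the same name in ``crr.py``)::

    # A sketch of CRR's "value-filtered regression" actor update, assuming a
    # Gaussian policy and a Q-network with the interfaces noted below.
    # Illustrative only; not RLlib's CRR implementation.
    import torch


    def crr_actor_loss(
        policy,             # assumed: policy(obs) -> torch.distributions.Distribution
        critic,             # assumed: critic(obs, act) -> Q-value tensor of shape [B]
        obs,                # batch of observations from the offline dataset, [B, obs_dim]
        actions,            # actions stored in the dataset, [B, act_dim]
        n_action_sample=4,  # number of policy actions used to estimate V(s)
        mode="binary",      # "binary" -> indicator filter, "exp" -> exponential weighting
        beta=1.0,           # temperature for the exponential weighting
    ):
        with torch.no_grad():
            # Q-value of the action actually taken in the dataset.
            q_data = critic(obs, actions)                           # [B]

            # Monte-Carlo estimate of V(s) = E_{a ~ pi}[Q(s, a)] using a few
            # actions sampled from the current policy.
            dist = policy(obs)
            sampled = dist.sample((n_action_sample,))               # [N, B, act_dim]
            q_pi = torch.stack(
                [critic(obs, sampled[i]) for i in range(n_action_sample)]
            )                                                       # [N, B]
            v_estimate = q_pi.mean(dim=0)                           # [B]

            # Advantage of the dataset action over the policy's value estimate.
            advantage = q_data - v_estimate                         # [B]

            if mode == "binary":
                # Keep only transitions whose action beats the policy's value estimate.
                weight = (advantage > 0.0).float()
            else:
                # Softer, exponential weighting (clamped for numerical stability).
                weight = torch.clamp(torch.exp(advantage / beta), max=20.0)

        # Weighted behavior cloning: imitate only the "promising" dataset actions.
        log_prob = policy(obs).log_prob(actions)
        if log_prob.dim() > 1:
            # Sum per-dimension log-probs for factorized action distributions.
            log_prob = log_prob.sum(dim=-1)
        return -(weight * log_prob).mean()

In such a sketch the critic itself would be trained separately with a standard TD objective on the same offline data, e.g. with twin Q-networks and periodic target-network updates, as the ``twin_q`` and ``target_update_grad_intervals`` settings in the config diff above suggest.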