From 8e63d3c1b5e5f4558594b325211fca1c170a22dd Mon Sep 17 00:00:00 2001
From: Matthew Earl <git@matthewearl.com>
Date: Fri, 13 Mar 2020 09:29:47 +0000
Subject: [PATCH 01/21] Implement GaussianSquashedGaussian.  Still buggy

---
 rllib/models/tf/tf_action_dist.py | 117 +++++++++++++++++++++++++-----
 1 file changed, 98 insertions(+), 19 deletions(-)

diff --git a/rllib/models/tf/tf_action_dist.py b/rllib/models/tf/tf_action_dist.py
index 2bac4f4bc52e..8fb4d9ce8df2 100644
--- a/rllib/models/tf/tf_action_dist.py
+++ b/rllib/models/tf/tf_action_dist.py
@@ -190,13 +190,7 @@ def required_model_output_shape(action_space, model_config):
         return np.prod(action_space.shape) * 2
 
 
-class SquashedGaussian(TFActionDistribution):
-    """A tanh-squashed Gaussian distribution defined by: mean, std, low, high.
-
-    The distribution will never return low or high exactly, but
-    `low`+SMALL_NUMBER or `high`-SMALL_NUMBER respectively.
-    """
-
+class _SquashedGaussianBase(TFActionDistribution):
     def __init__(self, inputs, model, low=-1.0, high=1.0):
         """Parameterizes the distribution via `inputs`.
 
@@ -209,15 +203,38 @@ def __init__(self, inputs, model, low=-1.0, high=1.0):
         assert tfp is not None
         loc, log_scale = tf.split(inputs, 2, axis=-1)
         # Clip `scale` values (coming from NN) to reasonable values.
-        log_scale = tf.clip_by_value(log_scale, MIN_LOG_NN_OUTPUT,
-                                     MAX_LOG_NN_OUTPUT)
-        scale = tf.exp(log_scale)
+        self.log_std = tf.clip_by_value(log_scale, MIN_LOG_NN_OUTPUT,
+                                        MAX_LOG_NN_OUTPUT)
+        scale = tf.exp(self.log_std)
         self.distr = tfp.distributions.Normal(loc=loc, scale=scale)
         assert np.all(np.less(low, high))
         self.low = low
         self.high = high
         super().__init__(inputs, model)
 
+    @override(ActionDistribution)
+    def deterministic_sample(self):
+        mean = self.distr.mean()
+        return self._squash(mean)
+
+    @override(TFActionDistribution)
+    def _build_sample_op(self):
+        return self._squash(self.distr.sample())
+
+    def _squash(self, raw_values):
+        raise NotImplementedError
+
+    def _unsquash(self, values):
+        raise NotImplementedError
+
+
+class SquashedGaussian(_SquashedGaussianBase):
+    """A tanh-squashed Gaussian distribution defined by: mean, std, low, high.
+
+    The distribution will never return low or high exactly, but
+    `low`+SMALL_NUMBER or `high`-SMALL_NUMBER respectively.
+    """
+
     @override(TFActionDistribution)
     def sampled_action_logp(self):
         unsquashed_values = self._unsquash(self.sample_op)
@@ -229,15 +246,6 @@ def sampled_action_logp(self):
             axis=-1)
         return log_prob
 
-    @override(ActionDistribution)
-    def deterministic_sample(self):
-        mean = self.distr.mean()
-        return self._squash(mean)
-
-    @override(TFActionDistribution)
-    def _build_sample_op(self):
-        return self._squash(self.distr.sample())
-
     @override(ActionDistribution)
     def logp(self, x):
         unsquashed_values = self._unsquash(x)
@@ -263,6 +271,77 @@ def _unsquash(self, values):
                              (self.high - self.low) * 2.0 - 1.0)
 
 
+class GaussianSquashedGaussian(_SquashedGaussianBase):
+    """A gaussian CDF-squashed Gaussian distribution.
+
+    The distribution will never return low or high exactly, but
+    `low`+SMALL_NUMBER or `high`-SMALL_NUMBER respectively.
+    """
+    # Chosen to match the standard logistic variance, so that:
+    #   Var(N(0, 0.5 * _SCALE)) = Var(Logistic(0, 1))
+    _SCALE = 0.5 * 1.8137
+
+    @override(ActionDistribution)
+    def logp(self, x):
+        unsquashed_values = self._unsquash(x)
+        log_prob = tf.reduce_sum(
+            self.distr.log_prob(value=unsquashed_values), axis=-1)
+        u = (unsquashed_values - self.low) / (self.high - self.low)
+        dist = tfp.distributions.Normal(loc=0, scale=self._SCALE)
+        log_prob -= tf.math.reduce_sum(dist.log_prob(value=u), axis=-1)
+        log_prob += tf.log(self.high - self.low)
+        return log_prob
+
+    @override(ActionDistribution)
+    def kl(self, other):
+        # KL(self || other) is just the KL of the two unsquashed distributions.
+        assert isinstance(other, GaussianSquashedGaussian)
+
+        mean = self.distr.mean()
+        std = self.distr.std()
+
+        other_mean = other.distr.mean()
+        other_std = other.distr.std()
+
+        return tf.reduce_sum(
+            other.log_std - self.log_std +
+            (tf.square(std) + tf.square(mean - other_mean)) /
+            (2.0 * tf.square(other_std)) - 0.5,
+            axis=1)
+
+    def entropy(self):
+        # Entropy is:
+        #   -KL(self.distr || N(0, _SCALE)) + log(high - low)
+        # where the latter distribution's CDF is used to do the squashing.
+
+        mean = self.distr.mean()
+        std = self.distr.std()
+
+        return tf.reduce_sum(
+            log(self.high - self.low) -
+            (tf.log(self._SCALE) - self.log_std +
+             (tf.square(std) + tf.square(mean)) /
+             (2.0 * tf.square(self._SCALE)) - 0.5))
+
+    def _squash(self, raw_values):
+        # Make sure raw_values are not too high/low (such that tanh would
+        # return exactly 1.0/-1.0, which would lead to +/-inf log-probs).
+
+        values = tfp.bijectors.NormalCDF().forward(
+                raw_values / self._SCALE
+        )
+        return (tf.clip_by_value(values,
+                                 SMALL_NUMBER,
+                                 1.0 - SMALL_NUMBER) *
+                (self.high - self.low) + self.low)
+
+    def _unsquash(self, values):
+        return self._SCALE * tfp.bijectors.NormalCDF().inverse(
+            (values - self.low) / (self.high - self.low)
+        )
+
+
+
 class Deterministic(TFActionDistribution):
     """Action distribution that returns the input values directly.
 

From 005c52420230013db6de5d2f840c616e18e8be75 Mon Sep 17 00:00:00 2001
From: Matthew Earl <git@matthewearl.com>
Date: Fri, 13 Mar 2020 10:00:24 +0000
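Subject: [PATCH 02/21] Fix bug in GaussianSquashedGaussian.logp

Evaluate the squashing normal's log-density at the unsquashed values
directly, and subtract (rather than add) log(high - low).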

---
 rllib/models/tf/tf_action_dist.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/rllib/models/tf/tf_action_dist.py b/rllib/models/tf/tf_action_dist.py
index 8fb4d9ce8df2..81c1cb5a0769 100644
--- a/rllib/models/tf/tf_action_dist.py
+++ b/rllib/models/tf/tf_action_dist.py
@@ -286,12 +286,12 @@ def logp(self, x):
         unsquashed_values = self._unsquash(x)
         log_prob = tf.reduce_sum(
             self.distr.log_prob(value=unsquashed_values), axis=-1)
-        u = (unsquashed_values - self.low) / (self.high - self.low)
-        dist = tfp.distributions.Normal(loc=0, scale=self._SCALE)
-        log_prob -= tf.math.reduce_sum(dist.log_prob(value=u), axis=-1)
-        log_prob += tf.log(self.high - self.low)
+        squash_dist = tfp.distributions.Normal(loc=0, scale=self._SCALE)
+        log_prob -= tf.reduce_sum(
+            squash_dist.log_prob(value=unsquashed_values), axis=-1)
+        log_prob -= tf.log(self.high - self.low)
         return log_prob
-
+        
     @override(ActionDistribution)
     def kl(self, other):
         # KL(self || other) is just the KL of the two unsquashed distributions.

From ba69bb7ceecb4d2ea19e6c1ca9870f3ce4ae0423 Mon Sep 17 00:00:00 2001
From: Matthew Earl <git@matthewearl.com>
Date: Fri, 13 Mar 2020 18:44:11 +0000
Subject: [PATCH 03/21] Fix bugs in KL and entropy methods

---
 rllib/models/tf/tf_action_dist.py | 29 +++++++++++++----------------
 1 file changed, 13 insertions(+), 16 deletions(-)

diff --git a/rllib/models/tf/tf_action_dist.py b/rllib/models/tf/tf_action_dist.py
index 81c1cb5a0769..faad2a4093b8 100644
--- a/rllib/models/tf/tf_action_dist.py
+++ b/rllib/models/tf/tf_action_dist.py
@@ -297,31 +297,28 @@ def kl(self, other):
         # KL(self || other) is just the KL of the two unsquashed distributions.
         assert isinstance(other, GaussianSquashedGaussian)
 
-        mean = self.distr.mean()
-        std = self.distr.std()
+        mean = self.distr.loc
+        std = self.distr.scale
 
-        other_mean = other.distr.mean()
-        other_std = other.distr.std()
+        other_mean = other.distr.loc
+        other_std = other.distr.scale
 
-        return tf.reduce_sum(
-            other.log_std - self.log_std +
-            (tf.square(std) + tf.square(mean - other_mean)) /
-            (2.0 * tf.square(other_std)) - 0.5,
-            axis=1)
+        return (other.log_std - self.log_std +
+                (tf.square(std) + tf.square(mean - other_mean)) /
+                (2.0 * tf.square(other_std)) - 0.5)
 
     def entropy(self):
         # Entropy is:
         #   -KL(self.distr || N(0, _SCALE)) + log(high - low)
         # where the latter distribution's CDF is used to do the squashing.
 
-        mean = self.distr.mean()
-        std = self.distr.std()
+        mean = self.distr.loc
+        std = self.distr.scale
 
-        return tf.reduce_sum(
-            log(self.high - self.low) -
-            (tf.log(self._SCALE) - self.log_std +
-             (tf.square(std) + tf.square(mean)) /
-             (2.0 * tf.square(self._SCALE)) - 0.5))
+        return (tf.log(self.high - self.low) -
+                (tf.log(self._SCALE) - self.log_std +
+                (tf.square(std) + tf.square(mean)) /
+                (2.0 * tf.square(self._SCALE)) - 0.5))
 
     def _squash(self, raw_values):
         # Make sure raw_values are not too high/low (such that tanh would

From 113fc4ff47a68e46b6409d4c43b61bbd3964e484 Mon Sep 17 00:00:00 2001
From: Matthew Earl <git@matthewearl.com>
Date: Fri, 13 Mar 2020 20:54:56 +0000
Subject: [PATCH 04/21] Initial attempt at integrating GSG into catalog

Still some bugs to fix
---
 rllib/models/catalog.py           | 34 +++++++++++++++++++++++---
 rllib/models/tf/tf_action_dist.py | 40 ++++++++++++++++---------------
 2 files changed, 52 insertions(+), 22 deletions(-)

diff --git a/rllib/models/catalog.py b/rllib/models/catalog.py
index 4fd864fde3ae..49c7c8f62506 100644
--- a/rllib/models/catalog.py
+++ b/rllib/models/catalog.py
@@ -13,7 +13,8 @@
 from ray.rllib.models.tf.fcnet_v2 import FullyConnectedNetwork as FCNetV2
 from ray.rllib.models.tf.visionnet_v2 import VisionNetwork as VisionNetV2
 from ray.rllib.models.tf.tf_action_dist import Categorical, MultiCategorical, \
-    Deterministic, DiagGaussian, MultiActionDistribution, Dirichlet
+    Deterministic, DiagGaussian, MultiActionDistribution, Dirichlet, \
+    GaussianSquashedGaussian
 from ray.rllib.models.preprocessors import get_preprocessor
 from ray.rllib.models.tf.fcnet_v1 import FullyConnectedNetwork
 from ray.rllib.models.tf.lstm_v1 import LSTM
@@ -104,6 +105,26 @@ class ModelCatalog:
         >>> action = dist.sample()
     """
 
+    @staticmethod
+    def _make_bounded_dist(action_space):
+        child_dists = []
+
+        low = np.ravel(action_space.low)
+        high = np.ravel(action_space.high)
+
+        for l, h in zip(low, high):
+            if np.isinf(l) and np.isinf(h):
+                dist = partial(GaussianSquashedGaussian, low=l, high=h)
+            else:
+                dist = DiagGaussian
+            child_dists.append(dist)
+
+        return partial(
+            MultiActionDistribution,
+            action_space=action_space,
+            child_distributions=child_dists,
+            input_lens=[2] * len(child_dists)), 2 * len(child_dists)
+
     @staticmethod
     @DeveloperAPI
     def get_action_dist(action_space,
@@ -147,9 +168,16 @@ def get_action_dist(action_space,
                     "Consider reshaping this into a single dimension, "
                     "using a custom action distribution, "
                     "using a Tuple action space, or the multi-agent API.")
-            # TODO(sven): Check for bounds and return SquashedNormal, etc..
             if dist_type is None:
-                dist = DiagGaussian if framework == "tf" else TorchDiagGaussian
+                any_bounded = np.any(action_space.bounded_below &
+                                     action_space.bounded_above)
+                if framework != "tf":
+                    return TorchDiagGaussian
+                elif np.any(action_space.bounded_below &
+                            action_space.bounded_above):
+                    return ModelCatalog._make_bounded_dist(action_space)
+                else:
+                    dist = TorchDiagGaussian
             elif dist_type == "deterministic":
                 dist = Deterministic
         # Discrete Space -> Categorical.
diff --git a/rllib/models/tf/tf_action_dist.py b/rllib/models/tf/tf_action_dist.py
index faad2a4093b8..ecc3b1ed6827 100644
--- a/rllib/models/tf/tf_action_dist.py
+++ b/rllib/models/tf/tf_action_dist.py
@@ -217,6 +217,13 @@ def deterministic_sample(self):
         mean = self.distr.mean()
         return self._squash(mean)
 
+    @override(ActionDistribution)
+    def logp(self, x):
+        unsquashed_values = self._unsquash(x)
+        log_prob = tf.reduce_sum(
+            self.distr.log_prob(value=unsquashed_values), axis=-1)
+        return log_prob - self._log_squash_grad(unsquashed_values)
+
     @override(TFActionDistribution)
     def _build_sample_op(self):
         return self._squash(self.distr.sample())
@@ -227,6 +234,9 @@ def _squash(self, raw_values):
     def _unsquash(self, values):
         raise NotImplementedError
 
+    def _log_squash_grad(self, unsquashed_values):
+        raise NotImplementedError
+
 
 class SquashedGaussian(_SquashedGaussianBase):
     """A tanh-squashed Gaussian distribution defined by: mean, std, low, high.
@@ -246,16 +256,11 @@ def sampled_action_logp(self):
             axis=-1)
         return log_prob
 
-    @override(ActionDistribution)
-    def logp(self, x):
-        unsquashed_values = self._unsquash(x)
-        log_prob = tf.reduce_sum(
-            self.distr.log_prob(value=unsquashed_values), axis=-1)
+    def _log_squash_grad(self, unsquashed_values):
         unsquashed_values_tanhd = tf.math.tanh(unsquashed_values)
-        log_prob -= tf.math.reduce_sum(
+        return tf.math.reduce_sum(
             tf.math.log(1 - unsquashed_values_tanhd**2 + SMALL_NUMBER),
             axis=-1)
-        return log_prob
 
     def _squash(self, raw_values):
         # Make sure raw_values are not too high/low (such that tanh would
@@ -266,6 +271,7 @@ def _squash(self, raw_values):
             1.0 - SMALL_NUMBER) + 1.0) / 2.0 * (self.high - self.low) + \
                self.low
 
+
     def _unsquash(self, values):
         return tf.math.atanh((values - self.low) /
                              (self.high - self.low) * 2.0 - 1.0)
@@ -278,20 +284,9 @@ class GaussianSquashedGaussian(_SquashedGaussianBase):
     `low`+SMALL_NUMBER or `high`-SMALL_NUMBER respectively.
     """
     # Chosen to match the standard logistic variance, so that:
-    #   Var(N(0, 0.5 * _SCALE)) = Var(Logistic(0, 1))
+    #   Var(N(0, 2 * _SCALE)) = Var(Logistic(0, 1))
     _SCALE = 0.5 * 1.8137
 
-    @override(ActionDistribution)
-    def logp(self, x):
-        unsquashed_values = self._unsquash(x)
-        log_prob = tf.reduce_sum(
-            self.distr.log_prob(value=unsquashed_values), axis=-1)
-        squash_dist = tfp.distributions.Normal(loc=0, scale=self._SCALE)
-        log_prob -= tf.reduce_sum(
-            squash_dist.log_prob(value=unsquashed_values), axis=-1)
-        log_prob -= tf.log(self.high - self.low)
-        return log_prob
-        
     @override(ActionDistribution)
     def kl(self, other):
         # KL(self || other) is just the KL of the two unsquashed distributions.
@@ -320,6 +315,13 @@ def entropy(self):
                 (tf.square(std) + tf.square(mean)) /
                 (2.0 * tf.square(self._SCALE)) - 0.5))
 
+    def _log_squash_grad(self, unsquashed_values):
+        squash_dist = tfp.distributions.Normal(loc=0, scale=self._SCALE)
+        log_grad = tf.reduce_sum(
+            squash_dist.log_prob(value=unsquashed_values), axis=-1)
+        log_grad += tf.log(self.high - self.low)
+        return log_grad
+
     def _squash(self, raw_values):
         # Make sure raw_values are not too high/low (such that tanh would
         # return exactly 1.0/-1.0, which would lead to +/-inf log-probs).

From c8e53ced9bccfc63d5934018562760e53d591be1 Mon Sep 17 00:00:00 2001
From: Matthew Earl <git@matthewearl.com>
Date: Sat, 14 Mar 2020 13:16:08 +0000
Subject: [PATCH 05/21] Fix up the shapes returned by SG

---
 rllib/models/catalog.py           |  5 ++-
 rllib/models/tf/tf_action_dist.py | 59 ++++++++++++++++++++++++-------
 2 files changed, 50 insertions(+), 14 deletions(-)

diff --git a/rllib/models/catalog.py b/rllib/models/catalog.py
index 49c7c8f62506..8ce1a5d4d97f 100644
--- a/rllib/models/catalog.py
+++ b/rllib/models/catalog.py
@@ -113,12 +113,15 @@ def _make_bounded_dist(action_space):
         high = np.ravel(action_space.high)
 
         for l, h in zip(low, high):
-            if np.isinf(l) and np.isinf(h):
+            if not np.isinf(l) and not np.isinf(h):
                 dist = partial(GaussianSquashedGaussian, low=l, high=h)
             else:
                 dist = DiagGaussian
             child_dists.append(dist)
 
+        if len(child_dists) == 1:
+            return dist, 2
+
         return partial(
             MultiActionDistribution,
             action_space=action_space,
diff --git a/rllib/models/tf/tf_action_dist.py b/rllib/models/tf/tf_action_dist.py
index ecc3b1ed6827..bdabf27efb06 100644
--- a/rllib/models/tf/tf_action_dist.py
+++ b/rllib/models/tf/tf_action_dist.py
@@ -191,6 +191,8 @@ def required_model_output_shape(action_space, model_config):
 
 
 class _SquashedGaussianBase(TFActionDistribution):
+    """A univariate gaussian distribution, squashed into bounded support."""
+
     def __init__(self, inputs, model, low=-1.0, high=1.0):
         """Parameterizes the distribution via `inputs`.
 
@@ -201,12 +203,14 @@ def __init__(self, inputs, model, low=-1.0, high=1.0):
                 (excluding this value).
         """
         assert tfp is not None
-        loc, log_scale = tf.split(inputs, 2, axis=-1)
+        loc, log_scale = inputs[:, 0], inputs[:, 1]
         # Clip `scale` values (coming from NN) to reasonable values.
         self.log_std = tf.clip_by_value(log_scale, MIN_LOG_NN_OUTPUT,
                                         MAX_LOG_NN_OUTPUT)
         scale = tf.exp(self.log_std)
         self.distr = tfp.distributions.Normal(loc=loc, scale=scale)
+        assert len(self.distr.loc.shape) == 1
+        assert len(self.distr.scale.shape) == 1
         assert np.all(np.less(low, high))
         self.low = low
         self.high = high
@@ -215,26 +219,59 @@ def __init__(self, inputs, model, low=-1.0, high=1.0):
     @override(ActionDistribution)
     def deterministic_sample(self):
         mean = self.distr.mean()
-        return self._squash(mean)
+        assert len(mean.shape) == 1, "Shape should be batch dim only"
+        s = self._squash(mean)
+        assert len(s.shape) == 1
+        return s[:, None]
 
     @override(ActionDistribution)
     def logp(self, x):
-        unsquashed_values = self._unsquash(x)
-        log_prob = tf.reduce_sum(
-            self.distr.log_prob(value=unsquashed_values), axis=-1)
+        assert len(x.shape) >= 2, "First dim batch, second dim variable"
+        unsquashed_values = self._unsquash(x[:, 0])
+        log_prob = self.distr.log_prob(value=unsquashed_values)
         return log_prob - self._log_squash_grad(unsquashed_values)
 
     @override(TFActionDistribution)
     def _build_sample_op(self):
-        return self._squash(self.distr.sample())
+        s = self._squash(self.distr.sample())
+        assert len(s.shape) == 1
+        return s[:, None]
 
-    def _squash(self, raw_values):
+    def _squash(self, unsquashed_values):
+        """Squash an array element-wise into the (high, low) range
+        
+        Arguments:
+            unsquashed_values: values to be squashed
+
+        Returns:
+            The squashed values.  The output shape is `unsquashed_values.shape`
+
+        """
         raise NotImplementedError
 
     def _unsquash(self, values):
+        """Unsquash an array element-wise from the (high, low) range
+        
+        Arguments:
+            squashed_values: values to be unsquashed
+
+        Returns:
+            The unsquashed values.  The output shape is `squashed_values.shape`
+
+        """
         raise NotImplementedError
 
     def _log_squash_grad(self, unsquashed_values):
+        """Log gradient of _squash with respect to its argument.
+
+        Arguments:
+            squashed_values:  Point at which to measure the gradient.
+
+        Returns:
+            The gradient at the given point.  The output shape is
+            `squashed_values.shape`.
+
+        """
         raise NotImplementedError
 
 
@@ -258,9 +295,7 @@ def sampled_action_logp(self):
 
     def _log_squash_grad(self, unsquashed_values):
         unsquashed_values_tanhd = tf.math.tanh(unsquashed_values)
-        return tf.math.reduce_sum(
-            tf.math.log(1 - unsquashed_values_tanhd**2 + SMALL_NUMBER),
-            axis=-1)
+        return tf.math.log(1 - unsquashed_values_tanhd**2 + SMALL_NUMBER)
 
     def _squash(self, raw_values):
         # Make sure raw_values are not too high/low (such that tanh would
@@ -271,7 +306,6 @@ def _squash(self, raw_values):
             1.0 - SMALL_NUMBER) + 1.0) / 2.0 * (self.high - self.low) + \
                self.low
 
-
     def _unsquash(self, values):
         return tf.math.atanh((values - self.low) /
                              (self.high - self.low) * 2.0 - 1.0)
@@ -317,8 +351,7 @@ def entropy(self):
 
     def _log_squash_grad(self, unsquashed_values):
         squash_dist = tfp.distributions.Normal(loc=0, scale=self._SCALE)
-        log_grad = tf.reduce_sum(
-            squash_dist.log_prob(value=unsquashed_values), axis=-1)
+        log_grad = squash_dist.log_prob(value=unsquashed_values)
         log_grad += tf.log(self.high - self.low)
         return log_grad
 

From f4521f7905d59e057167bc37d815b6e48f38c6e9 Mon Sep 17 00:00:00 2001
From: Matthew Earl <git@matthewearl.com>
Date: Sun, 15 Mar 2020 16:12:15 +0000
Subject: [PATCH 06/21] Reformatting according to scripts/format.sh

---
 rllib/models/catalog.py           |  4 ++--
 rllib/models/tf/tf_action_dist.py | 16 +++++-----------
 2 files changed, 7 insertions(+), 13 deletions(-)

diff --git a/rllib/models/catalog.py b/rllib/models/catalog.py
index 8ce1a5d4d97f..910068e3ca23 100644
--- a/rllib/models/catalog.py
+++ b/rllib/models/catalog.py
@@ -172,8 +172,8 @@ def get_action_dist(action_space,
                     "using a custom action distribution, "
                     "using a Tuple action space, or the multi-agent API.")
             if dist_type is None:
-                any_bounded = np.any(action_space.bounded_below &
-                                     action_space.bounded_above)
+                any_bounded = np.any(
+                    action_space.bounded_below & action_space.bounded_above)
                 if framework != "tf":
                     return TorchDiagGaussian
                 elif np.any(action_space.bounded_below &
diff --git a/rllib/models/tf/tf_action_dist.py b/rllib/models/tf/tf_action_dist.py
index bdabf27efb06..fd597e135131 100644
--- a/rllib/models/tf/tf_action_dist.py
+++ b/rllib/models/tf/tf_action_dist.py
@@ -346,8 +346,8 @@ def entropy(self):
 
         return (tf.log(self.high - self.low) -
                 (tf.log(self._SCALE) - self.log_std +
-                (tf.square(std) + tf.square(mean)) /
-                (2.0 * tf.square(self._SCALE)) - 0.5))
+                 (tf.square(std) + tf.square(mean)) /
+                 (2.0 * tf.square(self._SCALE)) - 0.5))
 
     def _log_squash_grad(self, unsquashed_values):
         squash_dist = tfp.distributions.Normal(loc=0, scale=self._SCALE)
@@ -359,19 +359,13 @@ def _squash(self, raw_values):
         # Make sure raw_values are not too high/low (such that tanh would
         # return exactly 1.0/-1.0, which would lead to +/-inf log-probs).
 
-        values = tfp.bijectors.NormalCDF().forward(
-                raw_values / self._SCALE
-        )
-        return (tf.clip_by_value(values,
-                                 SMALL_NUMBER,
-                                 1.0 - SMALL_NUMBER) *
+        values = tfp.bijectors.NormalCDF().forward(raw_values / self._SCALE)
+        return (tf.clip_by_value(values, SMALL_NUMBER, 1.0 - SMALL_NUMBER) *
                 (self.high - self.low) + self.low)
 
     def _unsquash(self, values):
         return self._SCALE * tfp.bijectors.NormalCDF().inverse(
-            (values - self.low) / (self.high - self.low)
-        )
-
+            (values - self.low) / (self.high - self.low))
 
 
 class Deterministic(TFActionDistribution):

From b0c2323a1f87caf803c3582571f232ed7c3a37a3 Mon Sep 17 00:00:00 2001
From: Matthew Earl <git@matthewearl.com>
Date: Tue, 14 Apr 2020 07:48:31 +0100
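Subject: [PATCH 07/21] code review markup

Remove the unused `any_bounded` local from get_action_dist, and use
DiagGaussian instead of TorchDiagGaussian in the final else branch.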

---
 rllib/models/catalog.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/rllib/models/catalog.py b/rllib/models/catalog.py
index 910068e3ca23..94d4a79be307 100644
--- a/rllib/models/catalog.py
+++ b/rllib/models/catalog.py
@@ -172,15 +172,13 @@ def get_action_dist(action_space,
                     "using a custom action distribution, "
                     "using a Tuple action space, or the multi-agent API.")
             if dist_type is None:
-                any_bounded = np.any(
-                    action_space.bounded_below & action_space.bounded_above)
                 if framework != "tf":
                     return TorchDiagGaussian
                 elif np.any(action_space.bounded_below &
                             action_space.bounded_above):
                     return ModelCatalog._make_bounded_dist(action_space)
                 else:
-                    dist = TorchDiagGaussian
+                    dist = DiagGaussian
             elif dist_type == "deterministic":
                 dist = Deterministic
         # Discrete Space -> Categorical.

From 0e161fc2920faf7c1eeba794a6a95bfbca359852 Mon Sep 17 00:00:00 2001
From: Matthew Earl <gitlab@matthewearl.com>
Date: Tue, 14 Apr 2020 11:19:19 +0100
Subject: [PATCH 08/21] Bound loc for numerical stability

---
 rllib/models/tf/tf_action_dist.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/rllib/models/tf/tf_action_dist.py b/rllib/models/tf/tf_action_dist.py
index fd597e135131..a810dd9f730a 100644
--- a/rllib/models/tf/tf_action_dist.py
+++ b/rllib/models/tf/tf_action_dist.py
@@ -207,6 +207,8 @@ def __init__(self, inputs, model, low=-1.0, high=1.0):
         # Clip `scale` values (coming from NN) to reasonable values.
         self.log_std = tf.clip_by_value(log_scale, MIN_LOG_NN_OUTPUT,
                                         MAX_LOG_NN_OUTPUT)
+        # Clip loc too, for numerical stability reasons.
+        loc = tf.clip_by_value(loc, -3, 3)
         scale = tf.exp(self.log_std)
         self.distr = tfp.distributions.Normal(loc=loc, scale=scale)
         assert len(self.distr.loc.shape) == 1

From f226d2e3df2c711e22c9737a95f40b7a719da761 Mon Sep 17 00:00:00 2001
From: Matthew Earl <git@matthewearl.com>
Date: Thu, 16 Apr 2020 20:10:24 +0100
Subject: [PATCH 09/21] Fix squashed gaussian unit test

---
 rllib/models/tests/test_distributions.py |  2 +-
 rllib/models/tf/tf_action_dist.py        | 28 +++++++++++++-----------
 2 files changed, 16 insertions(+), 14 deletions(-)

diff --git a/rllib/models/tests/test_distributions.py b/rllib/models/tests/test_distributions.py
index ebd3525acd62..9586b753275f 100644
--- a/rllib/models/tests/test_distributions.py
+++ b/rllib/models/tests/test_distributions.py
@@ -155,7 +155,7 @@ def test_squashed_gaussian(self):
             check(np.sum(sampled_action_logp), np.sum(log_prob), rtol=0.05)
 
             # NN output.
-            means = np.array([[0.1, 0.2, 0.3, 0.4, 50.0],
+            means = np.array([[0.1, 0.2, 0.3, 0.4, 2.9],
                               [-0.1, -0.2, -0.3, -0.4, -1.0]])
             log_stds = np.array([[0.8, -0.2, 0.3, -1.0, 2.0],
                                  [0.7, -0.3, 0.4, -0.9, 2.0]])
diff --git a/rllib/models/tf/tf_action_dist.py b/rllib/models/tf/tf_action_dist.py
index 32226b97c64e..1975f72788c3 100644
--- a/rllib/models/tf/tf_action_dist.py
+++ b/rllib/models/tf/tf_action_dist.py
@@ -256,7 +256,7 @@ def required_model_output_shape(action_space, model_config):
 
 
 class _SquashedGaussianBase(TFActionDistribution):
-    """A univariate gaussian distribution, squashed into bounded support."""
+    """A diagonal gaussian distribution, squashed into bounded support."""
 
     def __init__(self, inputs, model, low=-1.0, high=1.0):
         """Parameterizes the distribution via `inputs`.
@@ -268,16 +268,18 @@ def __init__(self, inputs, model, low=-1.0, high=1.0):
                 (excluding this value).
         """
         assert tfp is not None
-        loc, log_std = inputs[:, 0], inputs[:, 1]
+        mean, log_std = tf.split(inputs, 2, axis=-1)
+        self._num_vars = mean.shape[1]
+        assert log_std.shape[1] == self._num_vars
         # Clip `std` values (coming from NN) to reasonable values.
         self.log_std = tf.clip_by_value(log_std, MIN_LOG_NN_OUTPUT,
                                         MAX_LOG_NN_OUTPUT)
         # Clip loc too, for numerical stability reasons.
-        loc = tf.clip_by_value(loc, -3, 3)
+        mean = tf.clip_by_value(mean, -3, 3)
         std = tf.exp(self.log_std)
-        self.distr = tfp.distributions.Normal(loc=loc, scale=std)
-        assert len(self.distr.loc.shape) == 1
-        assert len(self.distr.scale.shape) == 1
+        self.distr = tfp.distributions.Normal(loc=mean, scale=std)
+        assert len(self.distr.loc.shape) == 2
+        assert len(self.distr.scale.shape) == 2
         assert np.all(np.less(low, high))
         self.low = low
         self.high = high
@@ -286,23 +288,23 @@ def __init__(self, inputs, model, low=-1.0, high=1.0):
     @override(ActionDistribution)
     def deterministic_sample(self):
         mean = self.distr.mean()
-        assert len(mean.shape) == 1, "Shape should be batch dim only"
+        assert len(mean.shape) == 2
         s = self._squash(mean)
-        assert len(s.shape) == 1
-        return s[:, None]
+        assert len(s.shape) == 2
+        return s
 
     @override(ActionDistribution)
     def logp(self, x):
         assert len(x.shape) >= 2, "First dim batch, second dim variable"
-        unsquashed_values = self._unsquash(x[:, 0])
+        unsquashed_values = self._unsquash(x)
         log_prob = self.distr.log_prob(value=unsquashed_values)
-        return log_prob - self._log_squash_grad(unsquashed_values)
+        return tf.reduce_sum(log_prob - self._log_squash_grad(unsquashed_values), axis=-1)
 
     @override(TFActionDistribution)
     def _build_sample_op(self):
         s = self._squash(self.distr.sample())
-        assert len(s.shape) == 1
-        return s[:, None]
+        assert len(s.shape) == 2
+        return s
 
     def _squash(self, unsquashed_values):
         """Squash an array element-wise into the (high, low) range

From 3e1d345347a022b78896a0a69f14edb192921811 Mon Sep 17 00:00:00 2001
From: Matthew Earl <git@matthewearl.com>
Date: Thu, 16 Apr 2020 21:17:14 +0100
Subject: [PATCH 10/21] Fix GaussianSquashedGaussian following the previous
 commit

---
 rllib/models/tf/tf_action_dist.py | 27 ++++++++-------------------
 1 file changed, 8 insertions(+), 19 deletions(-)

diff --git a/rllib/models/tf/tf_action_dist.py b/rllib/models/tf/tf_action_dist.py
index 1975f72788c3..0ecb364cf875 100644
--- a/rllib/models/tf/tf_action_dist.py
+++ b/rllib/models/tf/tf_action_dist.py
@@ -298,7 +298,7 @@ def logp(self, x):
         assert len(x.shape) >= 2, "First dim batch, second dim variable"
         unsquashed_values = self._unsquash(x)
         log_prob = self.distr.log_prob(value=unsquashed_values)
-        return tf.reduce_sum(log_prob - self._log_squash_grad(unsquashed_values), axis=-1)
+        return tf.reduce_sum(log_prob - self._log_squash_grad(unsquashed_values), axis=1)
 
     @override(TFActionDistribution)
     def _build_sample_op(self):
@@ -351,17 +351,6 @@ class SquashedGaussian(_SquashedGaussianBase):
     `low`+SMALL_NUMBER or `high`-SMALL_NUMBER respectively.
     """
 
-    @override(TFActionDistribution)
-    def sampled_action_logp(self):
-        unsquashed_values = self._unsquash(self.sample_op)
-        log_prob = tf.reduce_sum(
-            self.distr.log_prob(unsquashed_values), axis=-1)
-        unsquashed_values_tanhd = tf.math.tanh(unsquashed_values)
-        log_prob -= tf.math.reduce_sum(
-            tf.math.log(1 - unsquashed_values_tanhd**2 + SMALL_NUMBER),
-            axis=-1)
-        return log_prob
-
     def _log_squash_grad(self, unsquashed_values):
         unsquashed_values_tanhd = tf.math.tanh(unsquashed_values)
         return tf.math.log(1 - unsquashed_values_tanhd**2 + SMALL_NUMBER)
@@ -401,9 +390,9 @@ def kl(self, other):
         other_mean = other.distr.loc
         other_std = other.distr.scale
 
-        return (other.log_std - self.log_std +
-                (tf.square(std) + tf.square(mean - other_mean)) /
-                (2.0 * tf.square(other_std)) - 0.5)
+        return tf.reduce_sum((other.log_std - self.log_std +
+                             (tf.square(std) + tf.square(mean - other_mean)) /
+                             (2.0 * tf.square(other_std)) - 0.5), axis=1)
 
     def entropy(self):
         # Entropy is:
@@ -413,10 +402,10 @@ def entropy(self):
         mean = self.distr.loc
         std = self.distr.scale
 
-        return (tf.log(self.high - self.low) -
-                (tf.log(self._SCALE) - self.log_std +
-                 (tf.square(std) + tf.square(mean)) /
-                 (2.0 * tf.square(self._SCALE)) - 0.5))
+        return tf.reduce_sum(tf.log(self.high - self.low) -
+                             (tf.log(self._SCALE) - self.log_std +
+                              (tf.square(std) + tf.square(mean)) /
+                              (2.0 * tf.square(self._SCALE)) - 0.5), axis=1)
 
     def _log_squash_grad(self, unsquashed_values):
         squash_dist = tfp.distributions.Normal(loc=0, scale=self._SCALE)

From 9c9b8bce10f2f04a25e1481f581f411569c00569 Mon Sep 17 00:00:00 2001
From: Matthew Earl <git@matthewearl.com>
Date: Thu, 16 Apr 2020 22:36:50 +0100
Subject: [PATCH 11/21] add test for gaussian squashed gaussian

---
 rllib/models/tests/test_distributions.py | 31 +++++++++++++++++++++++-
 1 file changed, 30 insertions(+), 1 deletion(-)

diff --git a/rllib/models/tests/test_distributions.py b/rllib/models/tests/test_distributions.py
index 9586b753275f..605ab39b0de1 100644
--- a/rllib/models/tests/test_distributions.py
+++ b/rllib/models/tests/test_distributions.py
@@ -4,7 +4,7 @@
 import unittest
 
 from ray.rllib.models.tf.tf_action_dist import Categorical, MultiCategorical, \
-    SquashedGaussian, GumbelSoftmax
+    GaussianSquashedGaussian, SquashedGaussian, GumbelSoftmax
 from ray.rllib.models.torch.torch_action_dist import TorchMultiCategorical, \
     TorchSquashedGaussian, TorchBeta
 from ray.rllib.utils import try_import_tf, try_import_torch
@@ -185,6 +185,35 @@ def test_squashed_gaussian(self):
                 outs = sess.run(outs)
             check(outs, log_prob, decimals=4)
 
+    def test_gaussian_squashed_gaussian(self):
+        for fw, sess in framework_iterator(frameworks="tf", session=True):
+            inputs1 = tf.constant([[-0.5, 0.2, np.log(0.1), np.log(0.5)],
+                                   [0.6, 0.8, np.log(0.7), np.log(0.8)],
+                                   [-10.0, 1.2, np.log(0.9), np.log(1.0)]])
+
+            inputs2 = tf.constant([[0.2, 0.3, np.log(0.2), np.log(0.4)],
+                                   [0.6, 0.8, np.log(0.7), np.log(0.8)],
+                                   [-11.0, 1.2, np.log(0.9), np.log(1.0)]])
+
+            gsg_dist1 = GaussianSquashedGaussian(inputs1, None)
+            gsg_dist2 = GaussianSquashedGaussian(inputs2, None)
+
+            # KL, entropy, and logp values have been verified empirically.
+            check(sess.run(gsg_dist1.kl(gsg_dist2)),
+                  np.array([6.532504, 0., 0.]))
+            check(sess.run(gsg_dist1.entropy()),
+                  np.array([-0.74827796, 0.7070056, -4.971432]))
+            x = tf.constant([[-0.3939393939393939]])
+            check(sess.run(gsg_dist1.logp(x)),
+                  np.array([0.736003, -3.1547096, -6.5595593]))
+
+            # This is just the squashed distribution means. Verified using
+            # _unsquash (which was itself verified as part of the logp test).
+            expected = np.array([[-0.41861248, 0.1745522],
+                                 [0.49179232, 0.62231755],
+                                 [-0.99906087, 0.81425166]])
+            check(sess.run(gsg_dist1.deterministic_sample()), expected)
+
     def test_beta(self):
         input_space = Box(-2.0, 1.0, shape=(200, 10))
         low, high = -1.0, 2.0

From 731afbd60b53e0758a2dde57e08ea738626dfe39 Mon Sep 17 00:00:00 2001
From: Matthew Earl <git@matthewearl.com>
Date: Fri, 17 Apr 2020 10:05:32 +0100
Subject: [PATCH 12/21] linter fixes

---
 rllib/models/catalog.py           | 3 ++-
 rllib/models/tf/tf_action_dist.py | 7 ++++---
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/rllib/models/catalog.py b/rllib/models/catalog.py
index 7b0ff999ed03..79b715b536c0 100644
--- a/rllib/models/catalog.py
+++ b/rllib/models/catalog.py
@@ -13,7 +13,8 @@
 from ray.rllib.models.tf.lstm_v1 import LSTM
 from ray.rllib.models.tf.modelv1_compat import make_v1_wrapper
 from ray.rllib.models.tf.tf_action_dist import Categorical, MultiCategorical, \
-    Deterministic, DiagGaussian, MultiActionDistribution, Dirichlet, GaussianSquashedGaussian
+    Deterministic, DiagGaussian, MultiActionDistribution, Dirichlet, \
+    GaussianSquashedGaussian
 from ray.rllib.models.tf.tf_modelv2 import TFModelV2
 from ray.rllib.models.tf.visionnet_v1 import VisionNetwork
 from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
diff --git a/rllib/models/tf/tf_action_dist.py b/rllib/models/tf/tf_action_dist.py
index 0ecb364cf875..9c819b26922d 100644
--- a/rllib/models/tf/tf_action_dist.py
+++ b/rllib/models/tf/tf_action_dist.py
@@ -298,7 +298,8 @@ def logp(self, x):
         assert len(x.shape) >= 2, "First dim batch, second dim variable"
         unsquashed_values = self._unsquash(x)
         log_prob = self.distr.log_prob(value=unsquashed_values)
-        return tf.reduce_sum(log_prob - self._log_squash_grad(unsquashed_values), axis=1)
+        return tf.reduce_sum(log_prob -
+                             self._log_squash_grad(unsquashed_values), axis=1)
 
     @override(TFActionDistribution)
     def _build_sample_op(self):
@@ -308,7 +309,7 @@ def _build_sample_op(self):
 
     def _squash(self, unsquashed_values):
         """Squash an array element-wise into the (high, low) range
-        
+
         Arguments:
             unsquashed_values: values to be squashed
 
@@ -320,7 +321,7 @@ def _squash(self, unsquashed_values):
 
     def _unsquash(self, values):
         """Unsquash an array element-wise from the (high, low) range
-        
+
         Arguments:
             squashed_values: values to be unsquashed
 

From 7e89931e284e70db00b1942a5f7ed529430d6a39 Mon Sep 17 00:00:00 2001
From: sven1977 <svenmika1977@gmail.com>
Date: Mon, 11 Jan 2021 22:58:26 +0100
Subject: [PATCH 13/21] WIP.

---
 rllib/models/catalog.py                  |  4 +--
 rllib/models/tests/test_distributions.py | 19 ++++++++------
 rllib/models/tf/tf_action_dist.py        | 33 ++++++++++++++++--------
 3 files changed, 34 insertions(+), 22 deletions(-)

diff --git a/rllib/models/catalog.py b/rllib/models/catalog.py
index 6c9fa2ffa06c..eeb737bfdef8 100644
--- a/rllib/models/catalog.py
+++ b/rllib/models/catalog.py
@@ -218,9 +218,7 @@ def get_action_dist(
                             action_space.bounded_above):
                     return ModelCatalog._make_bounded_dist(action_space)
                 else:
-                    dist = DiagGaussian
-                #dist_cls = TorchDiagGaussian if framework == "torch" \
-                #    else DiagGaussian
+                    dist_cls = DiagGaussian
             elif dist_type == "deterministic":
                 dist_cls = TorchDeterministic if framework == "torch" \
                     else Deterministic
diff --git a/rllib/models/tests/test_distributions.py b/rllib/models/tests/test_distributions.py
index 0a7efac07ec5..ec9186fd8b86 100644
--- a/rllib/models/tests/test_distributions.py
+++ b/rllib/models/tests/test_distributions.py
@@ -383,18 +383,21 @@ def test_gaussian_squashed_gaussian(self):
             gsg_dist2 = GaussianSquashedGaussian(inputs2, None)
 
             # KL, entropy, and logp values have been verified empirically.
-            check(sess.run(gsg_dist1.kl(gsg_dist2)),
-                  np.array([6.532504, 0., 0.]))
-            check(sess.run(gsg_dist1.entropy()),
-                  np.array([-0.74827796, 0.7070056, -4.971432]))
+            check(
+                sess.run(gsg_dist1.kl(gsg_dist2)), np.array([6.532504, 0.,
+                                                             0.]))
+            check(
+                sess.run(gsg_dist1.entropy()),
+                np.array([-0.74827796, 0.7070056, -4.971432]))
             x = tf.constant([[-0.3939393939393939]])
-            check(sess.run(gsg_dist1.logp(x)),
-                  np.array([0.736003, -3.1547096, -6.5595593]))
+            check(
+                sess.run(gsg_dist1.logp(x)),
+                np.array([0.736003, -3.1547096, -6.5595593]))
 
             # This is just the squashed distribution means. Verified using
             # _unsquash (which was itself verified as part of the logp test).
-            expected = np.array([[-0.41861248, 0.1745522],
-                                 [0.49179232, 0.62231755],
+            expected = np.array([[-0.41861248,
+                                  0.1745522], [0.49179232, 0.62231755],
                                  [-0.99906087, 0.81425166]])
             check(sess.run(gsg_dist1.deterministic_sample()), expected)
 
diff --git a/rllib/models/tf/tf_action_dist.py b/rllib/models/tf/tf_action_dist.py
index 37372c9af907..eb796c697ef4 100644
--- a/rllib/models/tf/tf_action_dist.py
+++ b/rllib/models/tf/tf_action_dist.py
@@ -329,10 +329,10 @@ def logp(self, x: TensorType) -> TensorType:
         log_prob_gaussian = self.distr.log_prob(unsquashed_values)
         # For safety reasons, clamp somehow, only then sum up.
         log_prob_gaussian = tf.clip_by_value(log_prob_gaussian, -100, 100)
-        log_prob_gaussian = tf.reduce_sum(log_prob_gaussian, axis=-1)
         # Get log-prob for squashed Gaussian.
-        return tf.reduce_sum(log_prob_gaussian -
-                             self._log_squash_grad(unsquashed_values), axis=1)
+        return tf.reduce_sum(
+            log_prob_gaussian - self._log_squash_grad(unsquashed_values),
+            axis=-1)
 
     @override(TFActionDistribution)
     def _build_sample_op(self):
@@ -497,9 +497,11 @@ def kl(self, other):
         other_mean = other.distr.loc
         other_std = other.distr.scale
 
-        return tf.reduce_sum((other.log_std - self.log_std +
-                             (tf.square(std) + tf.square(mean - other_mean)) /
-                             (2.0 * tf.square(other_std)) - 0.5), axis=1)
+        return tf.reduce_sum(
+            (other.log_std - self.log_std +
+             (tf.math.square(std) + tf.math.square(mean - other_mean)) /
+             (2.0 * tf.math.square(other_std)) - 0.5),
+            axis=1)
 
     def entropy(self):
         # Entropy is:
@@ -509,15 +511,17 @@ def entropy(self):
         mean = self.distr.loc
         std = self.distr.scale
 
-        return tf.reduce_sum(tf.log(self.high - self.low) -
-                             (tf.log(self._SCALE) - self.log_std +
-                              (tf.square(std) + tf.square(mean)) /
-                              (2.0 * tf.square(self._SCALE)) - 0.5), axis=1)
+        return tf.reduce_sum(
+            tf.math.log(self.high - self.low) -
+            (tf.math.log(self._SCALE) - self.log_std +
+             (tf.math.square(std) + tf.math.square(mean)) /
+             (2.0 * tf.math.square(self._SCALE)) - 0.5),
+            axis=1)
 
     def _log_squash_grad(self, unsquashed_values):
         squash_dist = tfp.distributions.Normal(loc=0, scale=self._SCALE)
         log_grad = squash_dist.log_prob(value=unsquashed_values)
-        log_grad += tf.log(self.high - self.low)
+        log_grad += tf.math.log(self.high - self.low)
         return log_grad
 
     def _squash(self, raw_values):
@@ -532,6 +536,13 @@ def _unsquash(self, values):
         return self._SCALE * tfp.bijectors.NormalCDF().inverse(
             (values - self.low) / (self.high - self.low))
 
+    @staticmethod
+    @override(ActionDistribution)
+    def required_model_output_shape(
+            action_space: gym.Space,
+            model_config: ModelConfigDict) -> Union[int, np.ndarray]:
+        return np.prod(action_space.shape) * 2
+
 
 class Deterministic(TFActionDistribution):
     """Action distribution that returns the input values directly.

From 921843003ff09c4b0993b83ba96012a907ceff9f Mon Sep 17 00:00:00 2001
From: sven1977 <svenmika1977@gmail.com>
Date: Mon, 11 Jan 2021 23:32:10 +0100
Subject: [PATCH 14/21] LINT.

---
 rllib/models/tests/test_distributions.py | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/rllib/models/tests/test_distributions.py b/rllib/models/tests/test_distributions.py
index ec9186fd8b86..8061ed7a9bbb 100644
--- a/rllib/models/tests/test_distributions.py
+++ b/rllib/models/tests/test_distributions.py
@@ -371,13 +371,15 @@ def test_diag_gaussian(self):
 
     def test_gaussian_squashed_gaussian(self):
         for fw, sess in framework_iterator(frameworks="tf", session=True):
-            inputs1 = tf.constant([[-0.5, 0.2, np.log(0.1), np.log(0.5)],
-                                   [0.6, 0.8, np.log(0.7), np.log(0.8)],
-                                   [-10.0, 1.2, np.log(0.9), np.log(1.0)]])
-
-            inputs2 = tf.constant([[0.2, 0.3, np.log(0.2), np.log(0.4)],
-                                   [0.6, 0.8, np.log(0.7), np.log(0.8)],
-                                   [-11.0, 1.2, np.log(0.9), np.log(1.0)]])
+            inputs1 = tf.constant([
+                [-0.5, 0.2, np.log(0.1), np.log(0.5)],
+                [0.6, 0.8, np.log(0.7), np.log(0.8)],
+                [-10.0, 1.2, np.log(0.9), np.log(1.0)]])
+
+            inputs2 = tf.constant([
+                [0.2, 0.3, np.log(0.2), np.log(0.4)],
+                [0.6, 0.8, np.log(0.7), np.log(0.8)],
+                [-11.0, 1.2, np.log(0.9), np.log(1.0)]])
 
             gsg_dist1 = GaussianSquashedGaussian(inputs1, None)
             gsg_dist2 = GaussianSquashedGaussian(inputs2, None)

From ed7d261274780f7836e7419d2dc3a9d83cbfa7b0 Mon Sep 17 00:00:00 2001
From: sven1977 <svenmika1977@gmail.com>
Date: Tue, 12 Jan 2021 12:28:07 +0100
Subject: [PATCH 15/21] Fix.

---
 rllib/models/catalog.py                  |  2 +-
 rllib/models/tests/test_distributions.py | 28 ++++++++++++++++--------
 2 files changed, 20 insertions(+), 10 deletions(-)

diff --git a/rllib/models/catalog.py b/rllib/models/catalog.py
index eeb737bfdef8..317292ec6103 100644
--- a/rllib/models/catalog.py
+++ b/rllib/models/catalog.py
@@ -213,7 +213,7 @@ def get_action_dist(
                     "using a Tuple action space, or the multi-agent API.")
             if dist_type is None:
                 if framework == "torch":
-                    return TorchDiagGaussian
+                    dist_cls = TorchDiagGaussian
                 elif np.any(action_space.bounded_below &
                             action_space.bounded_above):
                     return ModelCatalog._make_bounded_dist(action_space)
diff --git a/rllib/models/tests/test_distributions.py b/rllib/models/tests/test_distributions.py
index 8061ed7a9bbb..32814976303e 100644
--- a/rllib/models/tests/test_distributions.py
+++ b/rllib/models/tests/test_distributions.py
@@ -371,15 +371,25 @@ def test_diag_gaussian(self):
 
     def test_gaussian_squashed_gaussian(self):
         for fw, sess in framework_iterator(frameworks="tf", session=True):
-            inputs1 = tf.constant([
-                [-0.5, 0.2, np.log(0.1), np.log(0.5)],
-                [0.6, 0.8, np.log(0.7), np.log(0.8)],
-                [-10.0, 1.2, np.log(0.9), np.log(1.0)]])
-
-            inputs2 = tf.constant([
-                [0.2, 0.3, np.log(0.2), np.log(0.4)],
-                [0.6, 0.8, np.log(0.7), np.log(0.8)],
-                [-11.0, 1.2, np.log(0.9), np.log(1.0)]])
+            inputs1 = tf.constant([[-0.5, 0.2,
+                                    np.log(0.1),
+                                    np.log(0.5)],
+                                   [0.6, 0.8,
+                                    np.log(0.7),
+                                    np.log(0.8)],
+                                   [-10.0, 1.2,
+                                    np.log(0.9),
+                                    np.log(1.0)]])
+
+            inputs2 = tf.constant([[0.2, 0.3,
+                                    np.log(0.2),
+                                    np.log(0.4)],
+                                   [0.6, 0.8,
+                                    np.log(0.7),
+                                    np.log(0.8)],
+                                   [-11.0, 1.2,
+                                    np.log(0.9),
+                                    np.log(1.0)]])
 
             gsg_dist1 = GaussianSquashedGaussian(inputs1, None)
             gsg_dist2 = GaussianSquashedGaussian(inputs2, None)

From 6098ddaf26e78042bb8cca89e0e8a94fe5709ae6 Mon Sep 17 00:00:00 2001
From: sven1977 <svenmika1977@gmail.com>
Date: Tue, 12 Jan 2021 19:52:45 +0100
Subject: [PATCH 16/21] Torch version and LINT.

---
 rllib/models/catalog.py                  |  54 +++---
 rllib/models/tests/test_distributions.py |  72 ++++----
 rllib/models/tf/tf_action_dist.py        | 140 +++++++++-------
 rllib/models/torch/torch_action_dist.py  | 201 +++++++++++++++++++----
 4 files changed, 308 insertions(+), 159 deletions(-)

diff --git a/rllib/models/catalog.py b/rllib/models/catalog.py
index 317292ec6103..2be6446a55d1 100644
--- a/rllib/models/catalog.py
+++ b/rllib/models/catalog.py
@@ -16,7 +16,7 @@
     GaussianSquashedGaussian, \
     MultiActionDistribution, MultiCategorical
 from ray.rllib.models.torch.torch_action_dist import TorchCategorical, \
-    TorchDeterministic, TorchDiagGaussian, \
+    TorchDeterministic, TorchDiagGaussian, TorchGaussianSquashedGaussian, \
     TorchMultiActionDistribution, TorchMultiCategorical
 from ray.rllib.utils.annotations import DeveloperAPI, PublicAPI
 from ray.rllib.utils.deprecation import DEPRECATED_VALUE
@@ -211,14 +211,31 @@ def get_action_dist(
                     "Consider reshaping this into a single dimension, "
                     "using a custom action distribution, "
                     "using a Tuple action space, or the multi-agent API.")
+
             if dist_type is None:
-                if framework == "torch":
-                    dist_cls = TorchDiagGaussian
-                elif np.any(action_space.bounded_below &
-                            action_space.bounded_above):
-                    return ModelCatalog._make_bounded_dist(action_space)
+                cls = TorchGaussianSquashedGaussian if framework == "torch" \
+                    else GaussianSquashedGaussian
+                if np.any(action_space.bounded_below &
+                          action_space.bounded_above):
+                    if any(action_space.low != action_space.low[0]) or \
+                            any(action_space.high != action_space.high[0]):
+                        raise UnsupportedSpaceException(
+                            "The Box space has non-matching low/high value(s)."
+                            " Make sure that all low/high values are the same "
+                            "accross the different dimensions of your Box. If "
+                            "the different dimensions must have different "
+                            "low/high values, try splitting up your space into"
+                            " a Tuple or Dict space.")
+                    dist_cls = partial(
+                        cls,
+                        low=action_space.low[0],
+                        high=action_space.high[0])
+                    num_inputs = cls.required_model_output_shape(
+                        action_space, config)
+                    return dist_cls, num_inputs
                 else:
-                    dist_cls = DiagGaussian
+                    dist_cls = TorchDiagGaussian if framework == "torch" else \
+                        DiagGaussian
             elif dist_type == "deterministic":
                 dist_cls = TorchDeterministic if framework == "torch" \
                     else Deterministic
@@ -730,29 +747,6 @@ def _get_multi_action_distribution(dist_class, action_space, config,
                 input_lens=input_lens), int(sum(input_lens))
         return dist_class
 
-    @staticmethod
-    def _make_bounded_dist(action_space):
-        child_dists = []
-
-        low = np.ravel(action_space.low)
-        high = np.ravel(action_space.high)
-
-        for l, h in zip(low, high):
-            if not np.isinf(l) and not np.isinf(h):
-                dist = partial(GaussianSquashedGaussian, low=l, high=h)
-            else:
-                dist = DiagGaussian
-            child_dists.append(dist)
-
-        if len(child_dists) == 1:
-            return dist, 2
-
-        return partial(
-            MultiActionDistribution,
-            action_space=action_space,
-            child_distributions=child_dists,
-            input_lens=[2] * len(child_dists)), 2 * len(child_dists)
-
     @staticmethod
     def _validate_config(config: ModelConfigDict, framework: str) -> None:
         """Validates a given model config dict.
diff --git a/rllib/models/tests/test_distributions.py b/rllib/models/tests/test_distributions.py
index 32814976303e..cac34560c589 100644
--- a/rllib/models/tests/test_distributions.py
+++ b/rllib/models/tests/test_distributions.py
@@ -10,8 +10,8 @@
     DiagGaussian, GaussianSquashedGaussian, GumbelSoftmax, \
     MultiActionDistribution, MultiCategorical, SquashedGaussian
 from ray.rllib.models.torch.torch_action_dist import TorchBeta, \
-    TorchCategorical, TorchDiagGaussian, TorchMultiActionDistribution, \
-    TorchMultiCategorical, TorchSquashedGaussian
+    TorchCategorical, TorchDiagGaussian, TorchGaussianSquashedGaussian, \
+    TorchMultiActionDistribution, TorchMultiCategorical, TorchSquashedGaussian
 from ray.rllib.utils.framework import try_import_tf, try_import_torch
 from ray.rllib.utils.numpy import MIN_LOG_NN_OUTPUT, MAX_LOG_NN_OUTPUT, \
     softmax, SMALL_NUMBER, LARGE_INTEGER
@@ -370,48 +370,48 @@ def test_diag_gaussian(self):
             check(outs, log_prob, decimals=4)
 
     def test_gaussian_squashed_gaussian(self):
-        for fw, sess in framework_iterator(frameworks="tf", session=True):
-            inputs1 = tf.constant([[-0.5, 0.2,
-                                    np.log(0.1),
-                                    np.log(0.5)],
-                                   [0.6, 0.8,
-                                    np.log(0.7),
-                                    np.log(0.8)],
-                                   [-10.0, 1.2,
-                                    np.log(0.9),
-                                    np.log(1.0)]])
-
-            inputs2 = tf.constant([[0.2, 0.3,
-                                    np.log(0.2),
-                                    np.log(0.4)],
-                                   [0.6, 0.8,
-                                    np.log(0.7),
-                                    np.log(0.8)],
-                                   [-11.0, 1.2,
-                                    np.log(0.9),
-                                    np.log(1.0)]])
-
-            gsg_dist1 = GaussianSquashedGaussian(inputs1, None)
-            gsg_dist2 = GaussianSquashedGaussian(inputs2, None)
+        for fw, sess in framework_iterator(session=True):
+            inputs1 = np.array(
+                [[-0.5, 0.2, np.log(0.1), np.log(0.5)], [
+                    0.6, 0.8, np.log(0.7), np.log(0.8)
+                ], [-10.0, 1.2, np.log(0.9),
+                    np.log(1.0)]],
+                dtype=np.float32)
+
+            inputs2 = np.array(
+                [[0.2, 0.3, np.log(0.2), np.log(0.4)], [
+                    0.6, 0.8, np.log(0.7), np.log(0.8)
+                ], [-11.0, 1.2, np.log(0.9),
+                    np.log(1.0)]],
+                dtype=np.float32)
+
+            cls = GaussianSquashedGaussian if fw != "torch" else \
+                TorchGaussianSquashedGaussian
+            gsg_dist1 = cls(inputs1, None)
+            gsg_dist2 = cls(inputs2, None)
 
             # KL, entropy, and logp values have been verified empirically.
             check(
-                sess.run(gsg_dist1.kl(gsg_dist2)), np.array([6.532504, 0.,
-                                                             0.]))
+                gsg_dist1.kl(gsg_dist2),
+                np.array([6.532504, 0.0, 0.0], dtype=np.float32))
             check(
-                sess.run(gsg_dist1.entropy()),
-                np.array([-0.74827796, 0.7070056, -4.971432]))
-            x = tf.constant([[-0.3939393939393939]])
+                gsg_dist1.entropy(),
+                np.array(
+                    [-0.74827796, 0.7070056, -4.971432], dtype=np.float32))
+            x = np.array([[-0.3939393939393939]], dtype=np.float32)
+            if fw == "torch":
+                x = torch.from_numpy(x)
             check(
-                sess.run(gsg_dist1.logp(x)),
-                np.array([0.736003, -3.1547096, -6.5595593]))
+                gsg_dist1.logp(x),
+                np.array([0.736003, -3.1547096, -6.5595593], dtype=np.float32))
 
             # This is just the squashed distribution means. Verified using
             # _unsquash (which was itself verified as part of the logp test).
-            expected = np.array([[-0.41861248,
-                                  0.1745522], [0.49179232, 0.62231755],
-                                 [-0.99906087, 0.81425166]])
-            check(sess.run(gsg_dist1.deterministic_sample()), expected)
+            expected = np.array(
+                [[-0.41861248, 0.1745522], [0.49179232, 0.62231755],
+                 [-0.99906087, 0.81425166]],
+                dtype=np.float32)
+            check(gsg_dist1.deterministic_sample(), expected)
 
     def test_beta(self):
         input_space = Box(-2.0, 1.0, shape=(2000, 10))
diff --git a/rllib/models/tf/tf_action_dist.py b/rllib/models/tf/tf_action_dist.py
index eb796c697ef4..9e02bd06fb36 100644
--- a/rllib/models/tf/tf_action_dist.py
+++ b/rllib/models/tf/tf_action_dist.py
@@ -385,6 +385,7 @@ class SquashedGaussian(_SquashedGaussianBase):
     `low`+SMALL_NUMBER or `high`-SMALL_NUMBER respectively.
     """
 
+    @override(_SquashedGaussianBase)
     def _log_squash_grad(self, unsquashed_values):
         unsquashed_values_tanhd = tf.math.tanh(unsquashed_values)
         return tf.math.log(1 - unsquashed_values_tanhd**2 + SMALL_NUMBER)
@@ -420,65 +421,13 @@ def required_model_output_shape(
         return np.prod(action_space.shape) * 2
 
 
-class Beta(TFActionDistribution):
-    """
-    A Beta distribution is defined on the interval [0, 1] and parameterized by
-    shape parameters alpha and beta (also called concentration parameters).
-
-    PDF(x; alpha, beta) = x**(alpha - 1) (1 - x)**(beta - 1) / Z
-        with Z = Gamma(alpha) Gamma(beta) / Gamma(alpha + beta)
-        and Gamma(n) = (n - 1)!
-    """
-
-    def __init__(self,
-                 inputs: List[TensorType],
-                 model: ModelV2,
-                 low: float = 0.0,
-                 high: float = 1.0):
-        # Stabilize input parameters (possibly coming from a linear layer).
-        inputs = tf.clip_by_value(inputs, log(SMALL_NUMBER),
-                                  -log(SMALL_NUMBER))
-        inputs = tf.math.log(tf.math.exp(inputs) + 1.0) + 1.0
-        self.low = low
-        self.high = high
-        alpha, beta = tf.split(inputs, 2, axis=-1)
-        # Note: concentration0==beta, concentration1=alpha (!)
-        self.dist = tfp.distributions.Beta(
-            concentration1=alpha, concentration0=beta)
-        super().__init__(inputs, model)
-
-    @override(ActionDistribution)
-    def deterministic_sample(self) -> TensorType:
-        mean = self.dist.mean()
-        return self._squash(mean)
-
-    @override(TFActionDistribution)
-    def _build_sample_op(self) -> TensorType:
-        return self._squash(self.dist.sample())
-
-    @override(ActionDistribution)
-    def logp(self, x: TensorType) -> TensorType:
-        unsquashed_values = self._unsquash(x)
-        return tf.math.reduce_sum(
-            self.dist.log_prob(unsquashed_values), axis=-1)
-
-    def _squash(self, raw_values: TensorType) -> TensorType:
-        return raw_values * (self.high - self.low) + self.low
-
-    def _unsquash(self, values: TensorType) -> TensorType:
-        return (values - self.low) / (self.high - self.low)
-
-    @staticmethod
-    @override(ActionDistribution)
-    def required_model_output_shape(
-            action_space: gym.Space,
-            model_config: ModelConfigDict) -> Union[int, np.ndarray]:
-        return np.prod(action_space.shape) * 2
-
-
 class GaussianSquashedGaussian(_SquashedGaussianBase):
     """A gaussian CDF-squashed Gaussian distribution.
 
+    Can be used instead of the `SquashedGaussian` in case entropy or KL need
+    to be computable in analytical form (`SquashedGaussian` can only provide
+    those empirically).
+
     The distribution will never return low or high exactly, but
     `low`+SMALL_NUMBER or `high`-SMALL_NUMBER respectively.
     """
@@ -544,6 +493,62 @@ def required_model_output_shape(
         return np.prod(action_space.shape) * 2
 
 
+class Beta(TFActionDistribution):
+    """
+    A Beta distribution is defined on the interval [0, 1] and parameterized by
+    shape parameters alpha and beta (also called concentration parameters).
+
+    PDF(x; alpha, beta) = x**(alpha - 1) (1 - x)**(beta - 1) / Z
+        with Z = Gamma(alpha) Gamma(beta) / Gamma(alpha + beta)
+        and Gamma(n) = (n - 1)!
+    """
+
+    def __init__(self,
+                 inputs: List[TensorType],
+                 model: ModelV2,
+                 low: float = 0.0,
+                 high: float = 1.0):
+        # Stabilize input parameters (possibly coming from a linear layer).
+        inputs = tf.clip_by_value(inputs, log(SMALL_NUMBER),
+                                  -log(SMALL_NUMBER))
+        inputs = tf.math.log(tf.math.exp(inputs) + 1.0) + 1.0
+        self.low = low
+        self.high = high
+        alpha, beta = tf.split(inputs, 2, axis=-1)
+        # Note: concentration0==beta, concentration1=alpha (!)
+        self.dist = tfp.distributions.Beta(
+            concentration1=alpha, concentration0=beta)
+        super().__init__(inputs, model)
+
+    @override(ActionDistribution)
+    def deterministic_sample(self) -> TensorType:
+        mean = self.dist.mean()
+        return self._squash(mean)
+
+    @override(TFActionDistribution)
+    def _build_sample_op(self) -> TensorType:
+        return self._squash(self.dist.sample())
+
+    @override(ActionDistribution)
+    def logp(self, x: TensorType) -> TensorType:
+        unsquashed_values = self._unsquash(x)
+        return tf.math.reduce_sum(
+            self.dist.log_prob(unsquashed_values), axis=-1)
+
+    def _squash(self, raw_values: TensorType) -> TensorType:
+        return raw_values * (self.high - self.low) + self.low
+
+    def _unsquash(self, values: TensorType) -> TensorType:
+        return (values - self.low) / (self.high - self.low)
+
+    @staticmethod
+    @override(ActionDistribution)
+    def required_model_output_shape(
+            action_space: gym.Space,
+            model_config: ModelConfigDict) -> Union[int, np.ndarray]:
+        return np.prod(action_space.shape) * 2
+
+
 class Deterministic(TFActionDistribution):
     """Action distribution that returns the input values directly.
 
@@ -573,15 +578,26 @@ def required_model_output_shape(
 
 class MultiActionDistribution(TFActionDistribution):
     """Action distribution that operates on a set of actions.
-
-    Args:
-        inputs (Tensor list): A list of tensors from which to compute samples.
     """
 
-    def __init__(self, inputs, model, *, child_distributions, input_lens,
-                 action_space):
-        ActionDistribution.__init__(self, inputs, model)
+    def __init__(self, inputs: List[TensorType], model: ModelV2, *,
+                 child_distributions: List[TFActionDistribution],
+                 input_lens: List[int], action_space: gym.spaces.Space):
+        """Initializes a MultiActionDistribution instance.
 
+        Args:
+            inputs (List[TensorType]): A list of tensors from which to compute
+                samples.
+            child_distributions (List[TFActionDistribution]): Flattened list
+                of the child distributions within this multi distribution.
+            input_lens (List[int]): List of input vector lengths corresponding
+                to the list of `child_distributions`.
+            action_space (gym.spaces.Space): The (Tuple/Dict) action space
+                underlying this multi distribution.
+        """
+        ActionDistribution.__init__(self, inputs, model)
+        # The base struct (python dict/tuple) corresponding to the complex
+        # action space.
         self.action_space_struct = get_base_struct_from_space(action_space)
 
         self.input_lens = np.array(input_lens, dtype=np.int32)
diff --git a/rllib/models/torch/torch_action_dist.py b/rllib/models/torch/torch_action_dist.py
index ecc8aa276a3e..ff4df66710c3 100644
--- a/rllib/models/torch/torch_action_dist.py
+++ b/rllib/models/torch/torch_action_dist.py
@@ -184,12 +184,8 @@ def required_model_output_shape(
         return np.prod(action_space.shape) * 2
 
 
-class TorchSquashedGaussian(TorchDistributionWrapper):
-    """A tanh-squashed Gaussian distribution defined by: mean, std, low, high.
-
-    The distribution will never return low or high exactly, but
-    `low`+SMALL_NUMBER or `high`-SMALL_NUMBER respectively.
-    """
+class _TorchSquashedGaussianBase(TorchDistributionWrapper):
+    """A diagonal gaussian distribution, squashed into bounded support."""
 
     def __init__(self,
                  inputs: List[TensorType],
@@ -205,49 +201,112 @@ def __init__(self,
                 (excluding this value).
         """
         super().__init__(inputs, model)
-        # Split inputs into mean and log(std).
+
+        assert low < high
+        # Make sure high and low are torch tensors.
+        self.low = torch.from_numpy(np.array(low))
+        self.high = torch.from_numpy(np.array(high))
+        # Place on correct device.
+        if isinstance(model, TorchModelV2):
+            device = next(model.parameters()).device
+            self.low = self.low.to(device)
+            self.high = self.high.to(device)
+
         mean, log_std = torch.chunk(self.inputs, 2, dim=-1)
-        # Clip `scale` values (coming from NN) to reasonable values.
-        log_std = torch.clamp(log_std, MIN_LOG_NN_OUTPUT, MAX_LOG_NN_OUTPUT)
-        std = torch.exp(log_std)
-        self.dist = torch.distributions.normal.Normal(mean, std)
-        assert np.all(np.less(low, high))
-        self.low = low
-        self.high = high
+        self._num_vars = mean.shape[1]
+        assert log_std.shape[1] == self._num_vars
+        # Clip `std` values (coming from NN) to reasonable values.
+        self.log_std = torch.clamp(log_std, MIN_LOG_NN_OUTPUT,
+                                   MAX_LOG_NN_OUTPUT)
+        # Clip loc too, for numerical stability reasons.
+        mean = torch.clamp(mean, -3, 3)
+        std = torch.exp(self.log_std)
+        self.distr = torch.distributions.normal.Normal(mean, std)
+        assert len(self.distr.loc.shape) == 2
+        assert len(self.distr.scale.shape) == 2
 
     @override(ActionDistribution)
     def deterministic_sample(self) -> TensorType:
-        self.last_sample = self._squash(self.dist.mean)
-        return self.last_sample
-
-    @override(TorchDistributionWrapper)
-    def sample(self) -> TensorType:
-        # Use the reparameterization version of `dist.sample` to allow for
-        # the results to be backprop'able e.g. in a loss term.
-        normal_sample = self.dist.rsample()
-        self.last_sample = self._squash(normal_sample)
-        return self.last_sample
+        mean = self.distr.loc
+        assert len(mean.shape) == 2
+        s = self._squash(mean)
+        assert len(s.shape) == 2
+        return s
 
     @override(ActionDistribution)
     def logp(self, x: TensorType) -> TensorType:
         # Unsquash values (from [low,high] to ]-inf,inf[)
+        assert len(x.shape) >= 2, "First dim batch, second dim variable"
         unsquashed_values = self._unsquash(x)
         # Get log prob of unsquashed values from our Normal.
-        log_prob_gaussian = self.dist.log_prob(unsquashed_values)
+        log_prob_gaussian = self.distr.log_prob(unsquashed_values)
         # For safety reasons, clamp somehow, only then sum up.
         log_prob_gaussian = torch.clamp(log_prob_gaussian, -100, 100)
-        log_prob_gaussian = torch.sum(log_prob_gaussian, dim=-1)
         # Get log-prob for squashed Gaussian.
-        unsquashed_values_tanhd = torch.tanh(unsquashed_values)
-        log_prob = log_prob_gaussian - torch.sum(
-            torch.log(1 - unsquashed_values_tanhd**2 + SMALL_NUMBER), dim=-1)
-        return log_prob
+        return torch.sum(
+            log_prob_gaussian - self._log_squash_grad(unsquashed_values),
+            dim=-1)
 
     @override(TorchDistributionWrapper)
+    def sample(self):
+        s = self._squash(self.distr.sample())
+        assert len(s.shape) == 2
+        return s
+
+    def _squash(self, unsquashed_values):
+        """Squash an array element-wise into the (high, low) range
+
+        Arguments:
+            unsquashed_values: values to be squashed
+
+        Returns:
+            The squashed values.  The output shape is `unsquashed_values.shape`
+
+        """
+        raise NotImplementedError
+
+    def _unsquash(self, values):
+        """Unsquash an array element-wise from the (low, high) range
+
+        Arguments:
+            values: values to be unsquashed
+
+        Returns:
+            The unsquashed values.  The output shape is `values.shape`
+
+        """
+        raise NotImplementedError
+
+    def _log_squash_grad(self, unsquashed_values):
+        """Log gradient of _squash with respect to its argument.
+
+        Arguments:
+            unsquashed_values:  Point at which to measure the gradient.
+
+        Returns:
+            The log gradient at the given point.  The output shape is
+            `unsquashed_values.shape`.
+
+        """
+        raise NotImplementedError
+
+
+class TorchSquashedGaussian(_TorchSquashedGaussianBase):
+    """A tanh-squashed Gaussian distribution defined by: mean, std, low, high.
+
+    The distribution will never return low or high exactly, but
+    `low`+SMALL_NUMBER or `high`-SMALL_NUMBER respectively.
+    """
+
+    def _log_squash_grad(self, unsquashed_values):
+        unsquashed_values_tanhd = torch.tanh(unsquashed_values)
+        return torch.log(1 - unsquashed_values_tanhd**2 + SMALL_NUMBER)
+
+    @override(ActionDistribution)
     def entropy(self) -> TensorType:
         raise ValueError("Entropy not defined for SquashedGaussian!")
 
-    @override(TorchDistributionWrapper)
+    @override(ActionDistribution)
     def kl(self, other: ActionDistribution) -> TensorType:
         raise ValueError("KL not defined for SquashedGaussian!")
 
@@ -274,6 +333,86 @@ def required_model_output_shape(
         return np.prod(action_space.shape) * 2
 
 
+class TorchGaussianSquashedGaussian(_TorchSquashedGaussianBase):
+    """A gaussian CDF-squashed Gaussian distribution.
+
+    Can be used instead of the `SquashedGaussian` in case entropy or KL need
+    to be computable in analytical form (`SquashedGaussian` can only provide
+    those empirically).
+
+    The distribution will never return low or high exactly, but
+    `low`+SMALL_NUMBER or `high`-SMALL_NUMBER respectively.
+    """
+    # Chosen to match the standard logistic variance, so that:
+    #   Var(N(0, 2 * _SCALE)) = Var(Logistic(0, 1))
+    _SCALE = 0.5 * 1.8137
+    SQUASH_DIST = torch.distributions.normal.Normal(0.0, _SCALE)
+
+    @override(_TorchSquashedGaussianBase)
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.scale = torch.from_numpy(np.array(self._SCALE))
+        if self.model:
+            self.scale = self.scale.to(
+                next(iter(self.model.parameters)).device)
+
+    @override(ActionDistribution)
+    def kl(self, other):
+        # KL(self || other) is just the KL of the two unsquashed distributions.
+        assert isinstance(other, TorchGaussianSquashedGaussian)
+
+        mean = self.distr.loc
+        std = self.distr.scale
+
+        other_mean = other.distr.loc
+        other_std = other.distr.scale
+
+        return torch.sum(
+            (other.log_std - self.log_std +
+             (torch.pow(std, 2.0) + torch.pow(mean - other_mean, 2.0)) /
+             (2.0 * torch.pow(other_std, 2.0)) - 0.5),
+            axis=1)
+
+    def entropy(self):
+        # Entropy is:
+        #   -KL(self.distr || N(0, _SCALE)) + log(high - low)
+        # where the latter distribution's CDF is used to do the squashing.
+
+        mean = self.distr.loc
+        std = self.distr.scale
+
+        return torch.sum(
+            torch.log(self.high - self.low) -
+            (torch.log(self.scale) - self.log_std +
+             (torch.pow(std, 2.0) + torch.pow(mean, 2.0)) /
+             (2.0 * torch.pow(self.scale, 2.0)) - 0.5),
+            dim=1)
+
+    def _log_squash_grad(self, unsquashed_values):
+        log_grad = self.SQUASH_DIST.log_prob(value=unsquashed_values)
+        log_grad += torch.log(self.high - self.low)
+        return log_grad
+
+    def _squash(self, raw_values):
+        # Clamp the CDF output away from exactly 0.0/1.0 (where `icdf` in
+        # `_unsquash` would return -/+inf and lead to +/-inf log-probs).
+
+        values = self.SQUASH_DIST.cdf(raw_values)  # / self._SCALE)
+        return (torch.clamp(values, SMALL_NUMBER, 1.0 - SMALL_NUMBER) *
+                (self.high - self.low) + self.low)
+
+    def _unsquash(self, values):
+        x = (values - self.low) / (self.high - self.low)
+        return self.SQUASH_DIST.icdf(x)
+
+    @staticmethod
+    @override(ActionDistribution)
+    def required_model_output_shape(
+            action_space: gym.Space,
+            model_config: ModelConfigDict) -> Union[int, np.ndarray]:
+        return np.prod(action_space.shape) * 2
+
+
 class TorchBeta(TorchDistributionWrapper):
     """
     A Beta distribution is defined on the interval [0, 1] and parameterized by

From 37f6986dfd3c544d0dbaa2170388881e0ef9a538 Mon Sep 17 00:00:00 2001
From: sven1977 <svenmika1977@gmail.com>
Date: Tue, 12 Jan 2021 22:19:02 +0100
Subject: [PATCH 17/21] LINT.

---
 rllib/models/torch/torch_action_dist.py | 19 +++++++++++--------
 rllib/tests/run_regression_tests.py     |  6 +++++-
 2 files changed, 16 insertions(+), 9 deletions(-)

diff --git a/rllib/models/torch/torch_action_dist.py b/rllib/models/torch/torch_action_dist.py
index ff4df66710c3..2dc7e72823d9 100644
--- a/rllib/models/torch/torch_action_dist.py
+++ b/rllib/models/torch/torch_action_dist.py
@@ -225,12 +225,20 @@ def __init__(self,
         assert len(self.distr.loc.shape) == 2
         assert len(self.distr.scale.shape) == 2
 
+    @override(TorchDistributionWrapper)
+    def sample(self):
+        s = self._squash(self.distr.sample())
+        assert len(s.shape) == 2
+        self.last_sample = s
+        return s
+
     @override(ActionDistribution)
     def deterministic_sample(self) -> TensorType:
         mean = self.distr.loc
         assert len(mean.shape) == 2
         s = self._squash(mean)
         assert len(s.shape) == 2
+        self.last_sample = s
         return s
 
     @override(ActionDistribution)
@@ -247,12 +255,6 @@ def logp(self, x: TensorType) -> TensorType:
             log_prob_gaussian - self._log_squash_grad(unsquashed_values),
             dim=-1)
 
-    @override(TorchDistributionWrapper)
-    def sample(self):
-        s = self._squash(self.distr.sample())
-        assert len(s.shape) == 2
-        return s
-
     def _squash(self, unsquashed_values):
         """Squash an array element-wise into the (high, low) range
 
@@ -346,7 +348,8 @@ class TorchGaussianSquashedGaussian(_TorchSquashedGaussianBase):
     # Chosen to match the standard logistic variance, so that:
     #   Var(N(0, 2 * _SCALE)) = Var(Logistic(0, 1))
     _SCALE = 0.5 * 1.8137
-    SQUASH_DIST = torch.distributions.normal.Normal(0.0, _SCALE)
+    SQUASH_DIST = \
+        torch.distributions.normal.Normal(0.0, _SCALE) if torch else None
 
     @override(_TorchSquashedGaussianBase)
     def __init__(self, *args, **kwargs):
@@ -354,7 +357,7 @@ def __init__(self, *args, **kwargs):
         self.scale = torch.from_numpy(np.array(self._SCALE))
         if self.model:
             self.scale = self.scale.to(
-                next(iter(self.model.parameters)).device)
+                next(iter(self.model.parameters())).device)
 
     @override(ActionDistribution)
     def kl(self, other):
diff --git a/rllib/tests/run_regression_tests.py b/rllib/tests/run_regression_tests.py
index 3f42147e4071..d5c2a73453a7 100644
--- a/rllib/tests/run_regression_tests.py
+++ b/rllib/tests/run_regression_tests.py
@@ -43,6 +43,10 @@
     "--torch",
     action="store_true",
     help="Runs all tests with PyTorch enabled.")
+parser.add_argument(
+    "--local-mode",
+    action="store_true",
+    help="Whether to run ray with `local_mode=True`.")
 
 if __name__ == "__main__":
     args = parser.parse_args()
@@ -92,7 +96,7 @@
         passed = False
         for i in range(3):
             try:
-                ray.init(num_cpus=5)
+                ray.init(num_cpus=5, local_mode=args.local_mode)
                 trials = run_experiments(experiments, resume=False, verbose=2)
             finally:
                 ray.shutdown()

From 32f42015628c0c870031623e930c6d0a2ec51a54 Mon Sep 17 00:00:00 2001
From: sven1977 <svenmika1977@gmail.com>
Date: Wed, 13 Jan 2021 09:02:22 +0100
Subject: [PATCH 18/21] Fix and LINT.

---
 rllib/models/catalog.py | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/rllib/models/catalog.py b/rllib/models/catalog.py
index 2be6446a55d1..6b7b9226058d 100644
--- a/rllib/models/catalog.py
+++ b/rllib/models/catalog.py
@@ -217,8 +217,10 @@ def get_action_dist(
                     else GaussianSquashedGaussian
                 if np.any(action_space.bounded_below &
                           action_space.bounded_above):
-                    if any(action_space.low != action_space.low[0]) or \
-                            any(action_space.high != action_space.high[0]):
+                    lo = np.min(action_space.low)
+                    hi = np.max(action_space.high)
+                    if any(action_space.low != lo) or \
+                            any(action_space.high != hi):
                         raise UnsupportedSpaceException(
                             "The Box space has non-matching low/high value(s)."
                             " Make sure that all low/high values are the same "
@@ -226,10 +228,7 @@ def get_action_dist(
                             "the different dimensions must have different "
                             "low/high values, try splitting up your space into"
                             " a Tuple or Dict space.")
-                    dist_cls = partial(
-                        cls,
-                        low=action_space.low[0],
-                        high=action_space.high[0])
+                    dist_cls = partial(cls, low=lo, high=hi)
                     num_inputs = cls.required_model_output_shape(
                         action_space, config)
                     return dist_cls, num_inputs

From c61739c3a28f30ff29cb87d00312f81c42c5995e Mon Sep 17 00:00:00 2001
From: sven1977 <svenmika1977@gmail.com>
Date: Wed, 13 Jan 2021 13:49:36 +0100
Subject: [PATCH 19/21] wip

---
 rllib/BUILD                                        | 14 +++++++-------
 rllib/models/catalog.py                            |  4 ++--
 .../tests/test_model_catalog.py}                   |  0
 3 files changed, 9 insertions(+), 9 deletions(-)
 rename rllib/{tests/test_catalog.py => models/tests/test_model_catalog.py} (100%)

diff --git a/rllib/BUILD b/rllib/BUILD
index 199cc5ad975e..7ec9225ed7a6 100644
--- a/rllib/BUILD
+++ b/rllib/BUILD
@@ -1074,6 +1074,13 @@ py_test(
 # Tag: models
 # --------------------------------------------------------------------
 
+py_test(
+    name = "tests/test_model_catalog",
+    tags = ["models"],
+    size = "small",
+    srcs = ["models/tests/test_model_catalog.py"]
+)
+
 py_test(
     name = "test_convtranspose2d_stack",
     tags = ["models"],
@@ -1222,13 +1229,6 @@ py_test(
     srcs = ["tests/test_attention_net_learning.py"]
 )
 
-py_test(
-    name = "tests/test_catalog",
-    tags = ["tests_dir", "tests_dir_C"],
-    size = "medium",
-    srcs = ["tests/test_catalog.py"]
-)
-
 py_test(
     name = "tests/test_checkpoint_restore",
     tags = ["tests_dir", "tests_dir_C"],
diff --git a/rllib/models/catalog.py b/rllib/models/catalog.py
index 99d82512dc5f..ffab27c0413a 100644
--- a/rllib/models/catalog.py
+++ b/rllib/models/catalog.py
@@ -231,8 +231,8 @@ def get_action_dist(
                           action_space.bounded_above):
                     lo = np.min(action_space.low)
                     hi = np.max(action_space.high)
-                    if any(action_space.low != lo) or \
-                            any(action_space.high != hi):
+                    if np.any(action_space.low != lo) or \
+                            np.any(action_space.high != hi):
                         raise UnsupportedSpaceException(
                             "The Box space has non-matching low/high value(s)."
                             " Make sure that all low/high values are the same "
diff --git a/rllib/tests/test_catalog.py b/rllib/models/tests/test_model_catalog.py
similarity index 100%
rename from rllib/tests/test_catalog.py
rename to rllib/models/tests/test_model_catalog.py

From ec3b6dcd39e97cdd5875ca506997f4bef73bb6ee Mon Sep 17 00:00:00 2001
From: sven1977 <svenmika1977@gmail.com>
Date: Sun, 11 Apr 2021 18:38:39 +0200
Subject: [PATCH 20/21] LINT.

---
 rllib/models/catalog.py | 31 +++++++++++++++----------------
 1 file changed, 15 insertions(+), 16 deletions(-)

diff --git a/rllib/models/catalog.py b/rllib/models/catalog.py
index 21f8bb3cd8d8..c82be2585708 100644
--- a/rllib/models/catalog.py
+++ b/rllib/models/catalog.py
@@ -186,7 +186,7 @@ def get_action_dist(
             config: ModelConfigDict,
             dist_type: Optional[Union[str, Type[ActionDistribution]]] = None,
             framework: str = "tf",
-            **kwargs) -> (type, int):
+            **kwargs) -> Tuple[Type[ActionDistribution], int]:
         """Returns a distribution class and size for the given action space.
 
         Args:
@@ -200,11 +200,9 @@ def get_action_dist(
                 constructor.
 
         Returns:
-            Tuple:
-                - dist_class (ActionDistribution): Python class of the
-                    distribution.
-                - dist_dim (int): The size of the input vector to the
-                    distribution.
+            Tuple[Type[ActionDistribution], int]: Python class of the
+                distribution and the size of the input vector to the
+                distribution.
         """
 
         dist_cls = None
@@ -250,8 +248,8 @@ def get_action_dist(
                         "using a Tuple action space, or the multi-agent API.")
 
                 if dist_type is None:
-                    cls = TorchGaussianSquashedGaussian if framework == "torch" \
-                        else GaussianSquashedGaussian
+                    cls = TorchGaussianSquashedGaussian if \
+                        framework == "torch" else GaussianSquashedGaussian
                     if np.any(action_space.bounded_below &
                               action_space.bounded_above):
                         lo = np.min(action_space.low)
@@ -259,19 +257,20 @@ def get_action_dist(
                         if np.any(action_space.low != lo) or \
                                 np.any(action_space.high != hi):
                             raise UnsupportedSpaceException(
-                                "The Box space has non-matching low/high value(s)."
-                                " Make sure that all low/high values are the same "
-                                "accross the different dimensions of your Box. If "
-                                "the different dimensions must have different "
-                                "low/high values, try splitting up your space into"
-                                " a Tuple or Dict space.")
+                                "The Box space has non-matching low/high "
+                                "value(s). Make sure that all low/high "
+                                "values are the same accross the different "
+                                "dimensions of your Box. If the different "
+                                "dimensions must have different low/high "
+                                "values, try splitting up your space into "
+                                "a Tuple or Dict space.")
                         dist_cls = partial(cls, low=lo, high=hi)
                         num_inputs = cls.required_model_output_shape(
                             action_space, config)
                         return dist_cls, num_inputs
                     else:
-                        dist_cls = TorchDiagGaussian if framework == "torch" else \
-                            DiagGaussian
+                        dist_cls = TorchDiagGaussian if \
+                            framework == "torch" else DiagGaussian
                 elif dist_type == "deterministic":
                     dist_cls = TorchDeterministic if framework == "torch" \
                         else Deterministic

From 4878362f7bdca492fe8c41e81b4682aa40b6337d Mon Sep 17 00:00:00 2001
From: sven1977 <svenmika1977@gmail.com>
Date: Sun, 11 Apr 2021 20:16:29 +0200
Subject: [PATCH 21/21] fix and LINT.

---
 rllib/models/catalog.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rllib/models/catalog.py b/rllib/models/catalog.py
index c82be2585708..d2817ac93265 100644
--- a/rllib/models/catalog.py
+++ b/rllib/models/catalog.py
@@ -186,7 +186,7 @@ def get_action_dist(
             config: ModelConfigDict,
             dist_type: Optional[Union[str, Type[ActionDistribution]]] = None,
             framework: str = "tf",
-            **kwargs) -> Tuple[Type[ActionDistribution], int]:
+            **kwargs) -> (Type[ActionDistribution], int):
         """Returns a distribution class and size for the given action space.
 
         Args: