From 8e63d3c1b5e5f4558594b325211fca1c170a22dd Mon Sep 17 00:00:00 2001 From: Matthew Earl Date: Fri, 13 Mar 2020 09:29:47 +0000 Subject: [PATCH 01/12] Implement GaussianSquashedGaussian. Still buggy --- rllib/models/tf/tf_action_dist.py | 117 +++++++++++++++++++++++++----- 1 file changed, 98 insertions(+), 19 deletions(-) diff --git a/rllib/models/tf/tf_action_dist.py b/rllib/models/tf/tf_action_dist.py index 2bac4f4bc52e..8fb4d9ce8df2 100644 --- a/rllib/models/tf/tf_action_dist.py +++ b/rllib/models/tf/tf_action_dist.py @@ -190,13 +190,7 @@ def required_model_output_shape(action_space, model_config): return np.prod(action_space.shape) * 2 -class SquashedGaussian(TFActionDistribution): - """A tanh-squashed Gaussian distribution defined by: mean, std, low, high. - - The distribution will never return low or high exactly, but - `low`+SMALL_NUMBER or `high`-SMALL_NUMBER respectively. - """ - +class _SquashedGaussianBase(TFActionDistribution): def __init__(self, inputs, model, low=-1.0, high=1.0): """Parameterizes the distribution via `inputs`. @@ -209,15 +203,38 @@ def __init__(self, inputs, model, low=-1.0, high=1.0): assert tfp is not None loc, log_scale = tf.split(inputs, 2, axis=-1) # Clip `scale` values (coming from NN) to reasonable values. - log_scale = tf.clip_by_value(log_scale, MIN_LOG_NN_OUTPUT, - MAX_LOG_NN_OUTPUT) - scale = tf.exp(log_scale) + self.log_std = tf.clip_by_value(log_scale, MIN_LOG_NN_OUTPUT, + MAX_LOG_NN_OUTPUT) + scale = tf.exp(self.log_std) self.distr = tfp.distributions.Normal(loc=loc, scale=scale) assert np.all(np.less(low, high)) self.low = low self.high = high super().__init__(inputs, model) + @override(ActionDistribution) + def deterministic_sample(self): + mean = self.distr.mean() + return self._squash(mean) + + @override(TFActionDistribution) + def _build_sample_op(self): + return self._squash(self.distr.sample()) + + def _squash(self, raw_values): + raise NotImplementedError + + def _unsquash(self, values): + raise NotImplementedError + + +class SquashedGaussian(_SquashedGaussianBase): + """A tanh-squashed Gaussian distribution defined by: mean, std, low, high. + + The distribution will never return low or high exactly, but + `low`+SMALL_NUMBER or `high`-SMALL_NUMBER respectively. + """ + @override(TFActionDistribution) def sampled_action_logp(self): unsquashed_values = self._unsquash(self.sample_op) @@ -229,15 +246,6 @@ def sampled_action_logp(self): axis=-1) return log_prob - @override(ActionDistribution) - def deterministic_sample(self): - mean = self.distr.mean() - return self._squash(mean) - - @override(TFActionDistribution) - def _build_sample_op(self): - return self._squash(self.distr.sample()) - @override(ActionDistribution) def logp(self, x): unsquashed_values = self._unsquash(x) @@ -263,6 +271,77 @@ def _unsquash(self, values): (self.high - self.low) * 2.0 - 1.0) +class GaussianSquashedGaussian(_SquashedGaussianBase): + """A gaussian CDF-squashed Gaussian distribution. + + The distribution will never return low or high exactly, but + `low`+SMALL_NUMBER or `high`-SMALL_NUMBER respectively. 
+ """ + # Chosen to match the standard logistic variance, so that: + # Var(N(0, 0.5 * _SCALE)) = Var(Logistic(0, 1)) + _SCALE = 0.5 * 1.8137 + + @override(ActionDistribution) + def logp(self, x): + unsquashed_values = self._unsquash(x) + log_prob = tf.reduce_sum( + self.distr.log_prob(value=unsquashed_values), axis=-1) + u = (unsquashed_values - self.low) / (self.high - self.low) + dist = tfp.distributions.Normal(loc=0, scale=self._SCALE) + log_prob -= tf.math.reduce_sum(dist.log_prob(value=u), axis=-1) + log_prob += tf.log(self.high - self.low) + return log_prob + + @override(ActionDistribution) + def kl(self, other): + # KL(self || other) is just the KL of the two unsquashed distributions. + assert isinstance(other, GaussianSquashedGaussian) + + mean = self.distr.mean() + std = self.distr.std() + + other_mean = other.distr.mean() + other_std = other.distr.std() + + return tf.reduce_sum( + other.log_std - self.log_std + + (tf.square(std) + tf.square(mean - other_mean)) / + (2.0 * tf.square(other_std)) - 0.5, + axis=1) + + def entropy(self): + # Entropy is: + # -KL(self.distr || N(0, _SCALE)) + log(high - low) + # where the latter distribution's CDF is used to do the squashing. + + mean = self.distr.mean() + std = self.distr.std() + + return tf.reduce_sum( + log(self.high - self.low) - + (tf.log(self._SCALE) - self.log_std + + (tf.square(std) + tf.square(mean)) / + (2.0 * tf.square(self._SCALE)) - 0.5)) + + def _squash(self, raw_values): + # Make sure raw_values are not too high/low (such that tanh would + # return exactly 1.0/-1.0, which would lead to +/-inf log-probs). + + values = tfp.bijectors.NormalCDF().forward( + raw_values / self._SCALE + ) + return (tf.clip_by_value(values, + SMALL_NUMBER, + 1.0 - SMALL_NUMBER) * + (self.high - self.low) + self.low) + + def _unsquash(self, values): + return self._SCALE * tfp.bijectors.NormalCDF().inverse( + (values - self.low) / (self.high - self.low) + ) + + + class Deterministic(TFActionDistribution): """Action distribution that returns the input values directly. From 005c52420230013db6de5d2f840c616e18e8be75 Mon Sep 17 00:00:00 2001 From: Matthew Earl Date: Fri, 13 Mar 2020 10:00:24 +0000 Subject: [PATCH 02/12] fix bug in gsg logp --- rllib/models/tf/tf_action_dist.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/rllib/models/tf/tf_action_dist.py b/rllib/models/tf/tf_action_dist.py index 8fb4d9ce8df2..81c1cb5a0769 100644 --- a/rllib/models/tf/tf_action_dist.py +++ b/rllib/models/tf/tf_action_dist.py @@ -286,12 +286,12 @@ def logp(self, x): unsquashed_values = self._unsquash(x) log_prob = tf.reduce_sum( self.distr.log_prob(value=unsquashed_values), axis=-1) - u = (unsquashed_values - self.low) / (self.high - self.low) - dist = tfp.distributions.Normal(loc=0, scale=self._SCALE) - log_prob -= tf.math.reduce_sum(dist.log_prob(value=u), axis=-1) - log_prob += tf.log(self.high - self.low) + squash_dist = tfp.distributions.Normal(loc=0, scale=self._SCALE) + log_prob -= tf.reduce_sum( + squash_dist.log_prob(value=unsquashed_values), axis=-1) + log_prob -= tf.log(self.high - self.low) return log_prob - + @override(ActionDistribution) def kl(self, other): # KL(self || other) is just the KL of the two unsquashed distributions. 
From ba69bb7ceecb4d2ea19e6c1ca9870f3ce4ae0423 Mon Sep 17 00:00:00 2001 From: Matthew Earl Date: Fri, 13 Mar 2020 18:44:11 +0000 Subject: [PATCH 03/12] Fix bugs in KL and entropy methods --- rllib/models/tf/tf_action_dist.py | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/rllib/models/tf/tf_action_dist.py b/rllib/models/tf/tf_action_dist.py index 81c1cb5a0769..faad2a4093b8 100644 --- a/rllib/models/tf/tf_action_dist.py +++ b/rllib/models/tf/tf_action_dist.py @@ -297,31 +297,28 @@ def kl(self, other): # KL(self || other) is just the KL of the two unsquashed distributions. assert isinstance(other, GaussianSquashedGaussian) - mean = self.distr.mean() - std = self.distr.std() + mean = self.distr.loc + std = self.distr.scale - other_mean = other.distr.mean() - other_std = other.distr.std() + other_mean = other.distr.loc + other_std = other.distr.scale - return tf.reduce_sum( - other.log_std - self.log_std + - (tf.square(std) + tf.square(mean - other_mean)) / - (2.0 * tf.square(other_std)) - 0.5, - axis=1) + return (other.log_std - self.log_std + + (tf.square(std) + tf.square(mean - other_mean)) / + (2.0 * tf.square(other_std)) - 0.5) def entropy(self): # Entropy is: # -KL(self.distr || N(0, _SCALE)) + log(high - low) # where the latter distribution's CDF is used to do the squashing. - mean = self.distr.mean() - std = self.distr.std() + mean = self.distr.loc + std = self.distr.scale - return tf.reduce_sum( - log(self.high - self.low) - - (tf.log(self._SCALE) - self.log_std + - (tf.square(std) + tf.square(mean)) / - (2.0 * tf.square(self._SCALE)) - 0.5)) + return (tf.log(self.high - self.low) - + (tf.log(self._SCALE) - self.log_std + + (tf.square(std) + tf.square(mean)) / + (2.0 * tf.square(self._SCALE)) - 0.5)) def _squash(self, raw_values): # Make sure raw_values are not too high/low (such that tanh would From 113fc4ff47a68e46b6409d4c43b61bbd3964e484 Mon Sep 17 00:00:00 2001 From: Matthew Earl Date: Fri, 13 Mar 2020 20:54:56 +0000 Subject: [PATCH 04/12] Initial attempt at integrating GSG into catalog Still some bugs to fix --- rllib/models/catalog.py | 34 +++++++++++++++++++++++--- rllib/models/tf/tf_action_dist.py | 40 ++++++++++++++++--------------- 2 files changed, 52 insertions(+), 22 deletions(-) diff --git a/rllib/models/catalog.py b/rllib/models/catalog.py index 4fd864fde3ae..49c7c8f62506 100644 --- a/rllib/models/catalog.py +++ b/rllib/models/catalog.py @@ -13,7 +13,8 @@ from ray.rllib.models.tf.fcnet_v2 import FullyConnectedNetwork as FCNetV2 from ray.rllib.models.tf.visionnet_v2 import VisionNetwork as VisionNetV2 from ray.rllib.models.tf.tf_action_dist import Categorical, MultiCategorical, \ - Deterministic, DiagGaussian, MultiActionDistribution, Dirichlet + Deterministic, DiagGaussian, MultiActionDistribution, Dirichlet, \ + GaussianSquashedGaussian from ray.rllib.models.preprocessors import get_preprocessor from ray.rllib.models.tf.fcnet_v1 import FullyConnectedNetwork from ray.rllib.models.tf.lstm_v1 import LSTM @@ -104,6 +105,26 @@ class ModelCatalog: >>> action = dist.sample() """ + @staticmethod + def _make_bounded_dist(action_space): + child_dists = [] + + low = np.ravel(action_space.low) + high = np.ravel(action_space.high) + + for l, h in zip(low, high): + if np.isinf(l) and np.isinf(h): + dist = partial(GaussianSquashedGaussian, low=l, high=h) + else: + dist = DiagGaussian + child_dists.append(dist) + + return partial( + MultiActionDistribution, + action_space=action_space, + child_distributions=child_dists, + input_lens=[2] 
* len(child_dists)), 2 * len(child_dists) + @staticmethod @DeveloperAPI def get_action_dist(action_space, @@ -147,9 +168,16 @@ def get_action_dist(action_space, "Consider reshaping this into a single dimension, " "using a custom action distribution, " "using a Tuple action space, or the multi-agent API.") - # TODO(sven): Check for bounds and return SquashedNormal, etc.. if dist_type is None: - dist = DiagGaussian if framework == "tf" else TorchDiagGaussian + any_bounded = np.any(action_space.bounded_below & + action_space.bounded_above) + if framework != "tf": + return TorchDiagGaussian + elif np.any(action_space.bounded_below & + action_space.bounded_above): + return ModelCatalog._make_bounded_dist(action_space) + else: + dist = TorchDiagGaussian elif dist_type == "deterministic": dist = Deterministic # Discrete Space -> Categorical. diff --git a/rllib/models/tf/tf_action_dist.py b/rllib/models/tf/tf_action_dist.py index faad2a4093b8..ecc3b1ed6827 100644 --- a/rllib/models/tf/tf_action_dist.py +++ b/rllib/models/tf/tf_action_dist.py @@ -217,6 +217,13 @@ def deterministic_sample(self): mean = self.distr.mean() return self._squash(mean) + @override(ActionDistribution) + def logp(self, x): + unsquashed_values = self._unsquash(x) + log_prob = tf.reduce_sum( + self.distr.log_prob(value=unsquashed_values), axis=-1) + return log_prob - self._log_squash_grad(unsquashed_values) + @override(TFActionDistribution) def _build_sample_op(self): return self._squash(self.distr.sample()) @@ -227,6 +234,9 @@ def _squash(self, raw_values): def _unsquash(self, values): raise NotImplementedError + def _log_squash_grad(self, unsquashed_values): + raise NotImplementedError + class SquashedGaussian(_SquashedGaussianBase): """A tanh-squashed Gaussian distribution defined by: mean, std, low, high. @@ -246,16 +256,11 @@ def sampled_action_logp(self): axis=-1) return log_prob - @override(ActionDistribution) - def logp(self, x): - unsquashed_values = self._unsquash(x) - log_prob = tf.reduce_sum( - self.distr.log_prob(value=unsquashed_values), axis=-1) + def _log_squash_grad(self, unsquashed_values): unsquashed_values_tanhd = tf.math.tanh(unsquashed_values) - log_prob -= tf.math.reduce_sum( + return tf.math.reduce_sum( tf.math.log(1 - unsquashed_values_tanhd**2 + SMALL_NUMBER), axis=-1) - return log_prob def _squash(self, raw_values): # Make sure raw_values are not too high/low (such that tanh would @@ -266,6 +271,7 @@ def _squash(self, raw_values): 1.0 - SMALL_NUMBER) + 1.0) / 2.0 * (self.high - self.low) + \ self.low + def _unsquash(self, values): return tf.math.atanh((values - self.low) / (self.high - self.low) * 2.0 - 1.0) @@ -278,20 +284,9 @@ class GaussianSquashedGaussian(_SquashedGaussianBase): `low`+SMALL_NUMBER or `high`-SMALL_NUMBER respectively. """ # Chosen to match the standard logistic variance, so that: - # Var(N(0, 0.5 * _SCALE)) = Var(Logistic(0, 1)) + # Var(N(0, 2 * _SCALE)) = Var(Logistic(0, 1)) _SCALE = 0.5 * 1.8137 - @override(ActionDistribution) - def logp(self, x): - unsquashed_values = self._unsquash(x) - log_prob = tf.reduce_sum( - self.distr.log_prob(value=unsquashed_values), axis=-1) - squash_dist = tfp.distributions.Normal(loc=0, scale=self._SCALE) - log_prob -= tf.reduce_sum( - squash_dist.log_prob(value=unsquashed_values), axis=-1) - log_prob -= tf.log(self.high - self.low) - return log_prob - @override(ActionDistribution) def kl(self, other): # KL(self || other) is just the KL of the two unsquashed distributions. 
@@ -320,6 +315,13 @@ def entropy(self): (tf.square(std) + tf.square(mean)) / (2.0 * tf.square(self._SCALE)) - 0.5)) + def _log_squash_grad(self, unsquashed_values): + squash_dist = tfp.distributions.Normal(loc=0, scale=self._SCALE) + log_grad = tf.reduce_sum( + squash_dist.log_prob(value=unsquashed_values), axis=-1) + log_grad += tf.log(self.high - self.low) + return log_grad + def _squash(self, raw_values): # Make sure raw_values are not too high/low (such that tanh would # return exactly 1.0/-1.0, which would lead to +/-inf log-probs). From c8e53ced9bccfc63d5934018562760e53d591be1 Mon Sep 17 00:00:00 2001 From: Matthew Earl Date: Sat, 14 Mar 2020 13:16:08 +0000 Subject: [PATCH 05/12] Fix up the shapes returned by SG --- rllib/models/catalog.py | 5 ++- rllib/models/tf/tf_action_dist.py | 59 ++++++++++++++++++++++++------- 2 files changed, 50 insertions(+), 14 deletions(-) diff --git a/rllib/models/catalog.py b/rllib/models/catalog.py index 49c7c8f62506..8ce1a5d4d97f 100644 --- a/rllib/models/catalog.py +++ b/rllib/models/catalog.py @@ -113,12 +113,15 @@ def _make_bounded_dist(action_space): high = np.ravel(action_space.high) for l, h in zip(low, high): - if np.isinf(l) and np.isinf(h): + if not np.isinf(l) and not np.isinf(h): dist = partial(GaussianSquashedGaussian, low=l, high=h) else: dist = DiagGaussian child_dists.append(dist) + if len(child_dists) == 1: + return dist, 2 + return partial( MultiActionDistribution, action_space=action_space, diff --git a/rllib/models/tf/tf_action_dist.py b/rllib/models/tf/tf_action_dist.py index ecc3b1ed6827..bdabf27efb06 100644 --- a/rllib/models/tf/tf_action_dist.py +++ b/rllib/models/tf/tf_action_dist.py @@ -191,6 +191,8 @@ def required_model_output_shape(action_space, model_config): class _SquashedGaussianBase(TFActionDistribution): + """A univariate gaussian distribution, squashed into bounded support.""" + def __init__(self, inputs, model, low=-1.0, high=1.0): """Parameterizes the distribution via `inputs`. @@ -201,12 +203,14 @@ def __init__(self, inputs, model, low=-1.0, high=1.0): (excluding this value). """ assert tfp is not None - loc, log_scale = tf.split(inputs, 2, axis=-1) + loc, log_scale = inputs[:, 0], inputs[:, 1] # Clip `scale` values (coming from NN) to reasonable values. 
self.log_std = tf.clip_by_value(log_scale, MIN_LOG_NN_OUTPUT, MAX_LOG_NN_OUTPUT) scale = tf.exp(self.log_std) self.distr = tfp.distributions.Normal(loc=loc, scale=scale) + assert len(self.distr.loc.shape) == 1 + assert len(self.distr.scale.shape) == 1 assert np.all(np.less(low, high)) self.low = low self.high = high @@ -215,26 +219,59 @@ def __init__(self, inputs, model, low=-1.0, high=1.0): @override(ActionDistribution) def deterministic_sample(self): mean = self.distr.mean() - return self._squash(mean) + assert len(mean.shape) == 1, "Shape should be batch dim only" + s = self._squash(mean) + assert len(s.shape) == 1 + return s[:, None] @override(ActionDistribution) def logp(self, x): - unsquashed_values = self._unsquash(x) - log_prob = tf.reduce_sum( - self.distr.log_prob(value=unsquashed_values), axis=-1) + assert len(x.shape) >= 2, "First dim batch, second dim variable" + unsquashed_values = self._unsquash(x[:, 0]) + log_prob = self.distr.log_prob(value=unsquashed_values) return log_prob - self._log_squash_grad(unsquashed_values) @override(TFActionDistribution) def _build_sample_op(self): - return self._squash(self.distr.sample()) + s = self._squash(self.distr.sample()) + assert len(s.shape) == 1 + return s[:, None] - def _squash(self, raw_values): + def _squash(self, unsquashed_values): + """Squash an array element-wise into the (high, low) range + + Arguments: + unsquashed_values: values to be squashed + + Returns: + The squashed values. The output shape is `unsquashed_values.shape` + + """ raise NotImplementedError def _unsquash(self, values): + """Unsquash an array element-wise from the (high, low) range + + Arguments: + squashed_values: values to be unsquashed + + Returns: + The unsquashed values. The output shape is `squashed_values.shape` + + """ raise NotImplementedError def _log_squash_grad(self, unsquashed_values): + """Log gradient of _squash with respect to its argument. + + Arguments: + squashed_values: Point at which to measure the gradient. + + Returns: + The gradient at the given point. The output shape is + `squashed_values.shape`. 
+ + """ raise NotImplementedError @@ -258,9 +295,7 @@ def sampled_action_logp(self): def _log_squash_grad(self, unsquashed_values): unsquashed_values_tanhd = tf.math.tanh(unsquashed_values) - return tf.math.reduce_sum( - tf.math.log(1 - unsquashed_values_tanhd**2 + SMALL_NUMBER), - axis=-1) + return tf.math.log(1 - unsquashed_values_tanhd**2 + SMALL_NUMBER) def _squash(self, raw_values): # Make sure raw_values are not too high/low (such that tanh would @@ -271,7 +306,6 @@ def _squash(self, raw_values): 1.0 - SMALL_NUMBER) + 1.0) / 2.0 * (self.high - self.low) + \ self.low - def _unsquash(self, values): return tf.math.atanh((values - self.low) / (self.high - self.low) * 2.0 - 1.0) @@ -317,8 +351,7 @@ def entropy(self): def _log_squash_grad(self, unsquashed_values): squash_dist = tfp.distributions.Normal(loc=0, scale=self._SCALE) - log_grad = tf.reduce_sum( - squash_dist.log_prob(value=unsquashed_values), axis=-1) + log_grad = squash_dist.log_prob(value=unsquashed_values) log_grad += tf.log(self.high - self.low) return log_grad From f4521f7905d59e057167bc37d815b6e48f38c6e9 Mon Sep 17 00:00:00 2001 From: Matthew Earl Date: Sun, 15 Mar 2020 16:12:15 +0000 Subject: [PATCH 06/12] Reformatting according to scripts/format.sh --- rllib/models/catalog.py | 4 ++-- rllib/models/tf/tf_action_dist.py | 16 +++++----------- 2 files changed, 7 insertions(+), 13 deletions(-) diff --git a/rllib/models/catalog.py b/rllib/models/catalog.py index 8ce1a5d4d97f..910068e3ca23 100644 --- a/rllib/models/catalog.py +++ b/rllib/models/catalog.py @@ -172,8 +172,8 @@ def get_action_dist(action_space, "using a custom action distribution, " "using a Tuple action space, or the multi-agent API.") if dist_type is None: - any_bounded = np.any(action_space.bounded_below & - action_space.bounded_above) + any_bounded = np.any( + action_space.bounded_below & action_space.bounded_above) if framework != "tf": return TorchDiagGaussian elif np.any(action_space.bounded_below & diff --git a/rllib/models/tf/tf_action_dist.py b/rllib/models/tf/tf_action_dist.py index bdabf27efb06..fd597e135131 100644 --- a/rllib/models/tf/tf_action_dist.py +++ b/rllib/models/tf/tf_action_dist.py @@ -346,8 +346,8 @@ def entropy(self): return (tf.log(self.high - self.low) - (tf.log(self._SCALE) - self.log_std + - (tf.square(std) + tf.square(mean)) / - (2.0 * tf.square(self._SCALE)) - 0.5)) + (tf.square(std) + tf.square(mean)) / + (2.0 * tf.square(self._SCALE)) - 0.5)) def _log_squash_grad(self, unsquashed_values): squash_dist = tfp.distributions.Normal(loc=0, scale=self._SCALE) @@ -359,19 +359,13 @@ def _squash(self, raw_values): # Make sure raw_values are not too high/low (such that tanh would # return exactly 1.0/-1.0, which would lead to +/-inf log-probs). 
- values = tfp.bijectors.NormalCDF().forward( - raw_values / self._SCALE - ) - return (tf.clip_by_value(values, - SMALL_NUMBER, - 1.0 - SMALL_NUMBER) * + values = tfp.bijectors.NormalCDF().forward(raw_values / self._SCALE) + return (tf.clip_by_value(values, SMALL_NUMBER, 1.0 - SMALL_NUMBER) * (self.high - self.low) + self.low) def _unsquash(self, values): return self._SCALE * tfp.bijectors.NormalCDF().inverse( - (values - self.low) / (self.high - self.low) - ) - + (values - self.low) / (self.high - self.low)) class Deterministic(TFActionDistribution): From b0c2323a1f87caf803c3582571f232ed7c3a37a3 Mon Sep 17 00:00:00 2001 From: Matthew Earl Date: Tue, 14 Apr 2020 07:48:31 +0100 Subject: [PATCH 07/12] code review markup --- rllib/models/catalog.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/rllib/models/catalog.py b/rllib/models/catalog.py index 910068e3ca23..94d4a79be307 100644 --- a/rllib/models/catalog.py +++ b/rllib/models/catalog.py @@ -172,15 +172,13 @@ def get_action_dist(action_space, "using a custom action distribution, " "using a Tuple action space, or the multi-agent API.") if dist_type is None: - any_bounded = np.any( - action_space.bounded_below & action_space.bounded_above) if framework != "tf": return TorchDiagGaussian elif np.any(action_space.bounded_below & action_space.bounded_above): return ModelCatalog._make_bounded_dist(action_space) else: - dist = TorchDiagGaussian + dist = DiagGaussian elif dist_type == "deterministic": dist = Deterministic # Discrete Space -> Categorical. From 0e161fc2920faf7c1eeba794a6a95bfbca359852 Mon Sep 17 00:00:00 2001 From: Matthew Earl Date: Tue, 14 Apr 2020 11:19:19 +0100 Subject: [PATCH 08/12] Bound loc for numerical stability --- rllib/models/tf/tf_action_dist.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/rllib/models/tf/tf_action_dist.py b/rllib/models/tf/tf_action_dist.py index fd597e135131..a810dd9f730a 100644 --- a/rllib/models/tf/tf_action_dist.py +++ b/rllib/models/tf/tf_action_dist.py @@ -207,6 +207,8 @@ def __init__(self, inputs, model, low=-1.0, high=1.0): # Clip `scale` values (coming from NN) to reasonable values. self.log_std = tf.clip_by_value(log_scale, MIN_LOG_NN_OUTPUT, MAX_LOG_NN_OUTPUT) + # Clip loc too, for numerical stability reasons. + loc = tf.clip_by_value(loc, -3, 3) scale = tf.exp(self.log_std) self.distr = tfp.distributions.Normal(loc=loc, scale=scale) assert len(self.distr.loc.shape) == 1 From f226d2e3df2c711e22c9737a95f40b7a719da761 Mon Sep 17 00:00:00 2001 From: Matthew Earl Date: Thu, 16 Apr 2020 20:10:24 +0100 Subject: [PATCH 09/12] Fix squashed gaussian unit test --- rllib/models/tests/test_distributions.py | 2 +- rllib/models/tf/tf_action_dist.py | 28 +++++++++++++----------- 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/rllib/models/tests/test_distributions.py b/rllib/models/tests/test_distributions.py index ebd3525acd62..9586b753275f 100644 --- a/rllib/models/tests/test_distributions.py +++ b/rllib/models/tests/test_distributions.py @@ -155,7 +155,7 @@ def test_squashed_gaussian(self): check(np.sum(sampled_action_logp), np.sum(log_prob), rtol=0.05) # NN output. 
- means = np.array([[0.1, 0.2, 0.3, 0.4, 50.0], + means = np.array([[0.1, 0.2, 0.3, 0.4, 2.9], [-0.1, -0.2, -0.3, -0.4, -1.0]]) log_stds = np.array([[0.8, -0.2, 0.3, -1.0, 2.0], [0.7, -0.3, 0.4, -0.9, 2.0]]) diff --git a/rllib/models/tf/tf_action_dist.py b/rllib/models/tf/tf_action_dist.py index 32226b97c64e..1975f72788c3 100644 --- a/rllib/models/tf/tf_action_dist.py +++ b/rllib/models/tf/tf_action_dist.py @@ -256,7 +256,7 @@ def required_model_output_shape(action_space, model_config): class _SquashedGaussianBase(TFActionDistribution): - """A univariate gaussian distribution, squashed into bounded support.""" + """A diagonal gaussian distribution, squashed into bounded support.""" def __init__(self, inputs, model, low=-1.0, high=1.0): """Parameterizes the distribution via `inputs`. @@ -268,16 +268,18 @@ def __init__(self, inputs, model, low=-1.0, high=1.0): (excluding this value). """ assert tfp is not None - loc, log_std = inputs[:, 0], inputs[:, 1] + mean, log_std = tf.split(inputs, 2, axis=-1) + self._num_vars = mean.shape[1] + assert log_std.shape[1] == self._num_vars # Clip `std` values (coming from NN) to reasonable values. self.log_std = tf.clip_by_value(log_std, MIN_LOG_NN_OUTPUT, MAX_LOG_NN_OUTPUT) # Clip loc too, for numerical stability reasons. - loc = tf.clip_by_value(loc, -3, 3) + mean = tf.clip_by_value(mean, -3, 3) std = tf.exp(self.log_std) - self.distr = tfp.distributions.Normal(loc=loc, scale=std) - assert len(self.distr.loc.shape) == 1 - assert len(self.distr.scale.shape) == 1 + self.distr = tfp.distributions.Normal(loc=mean, scale=std) + assert len(self.distr.loc.shape) == 2 + assert len(self.distr.scale.shape) == 2 assert np.all(np.less(low, high)) self.low = low self.high = high @@ -286,23 +288,23 @@ def __init__(self, inputs, model, low=-1.0, high=1.0): @override(ActionDistribution) def deterministic_sample(self): mean = self.distr.mean() - assert len(mean.shape) == 1, "Shape should be batch dim only" + assert len(mean.shape) == 2 s = self._squash(mean) - assert len(s.shape) == 1 - return s[:, None] + assert len(s.shape) == 2 + return s @override(ActionDistribution) def logp(self, x): assert len(x.shape) >= 2, "First dim batch, second dim variable" - unsquashed_values = self._unsquash(x[:, 0]) + unsquashed_values = self._unsquash(x) log_prob = self.distr.log_prob(value=unsquashed_values) - return log_prob - self._log_squash_grad(unsquashed_values) + return tf.reduce_sum(log_prob - self._log_squash_grad(unsquashed_values), axis=-1) @override(TFActionDistribution) def _build_sample_op(self): s = self._squash(self.distr.sample()) - assert len(s.shape) == 1 - return s[:, None] + assert len(s.shape) == 2 + return s def _squash(self, unsquashed_values): """Squash an array element-wise into the (high, low) range From 3e1d345347a022b78896a0a69f14edb192921811 Mon Sep 17 00:00:00 2001 From: Matthew Earl Date: Thu, 16 Apr 2020 21:17:14 +0100 Subject: [PATCH 10/12] Fix gaussian squashed gaussian following the previous commit --- rllib/models/tf/tf_action_dist.py | 27 ++++++++------------------- 1 file changed, 8 insertions(+), 19 deletions(-) diff --git a/rllib/models/tf/tf_action_dist.py b/rllib/models/tf/tf_action_dist.py index 1975f72788c3..0ecb364cf875 100644 --- a/rllib/models/tf/tf_action_dist.py +++ b/rllib/models/tf/tf_action_dist.py @@ -298,7 +298,7 @@ def logp(self, x): assert len(x.shape) >= 2, "First dim batch, second dim variable" unsquashed_values = self._unsquash(x) log_prob = self.distr.log_prob(value=unsquashed_values) - return tf.reduce_sum(log_prob - 
self._log_squash_grad(unsquashed_values), axis=-1) + return tf.reduce_sum(log_prob - self._log_squash_grad(unsquashed_values), axis=1) @override(TFActionDistribution) def _build_sample_op(self): @@ -351,17 +351,6 @@ class SquashedGaussian(_SquashedGaussianBase): `low`+SMALL_NUMBER or `high`-SMALL_NUMBER respectively. """ - @override(TFActionDistribution) - def sampled_action_logp(self): - unsquashed_values = self._unsquash(self.sample_op) - log_prob = tf.reduce_sum( - self.distr.log_prob(unsquashed_values), axis=-1) - unsquashed_values_tanhd = tf.math.tanh(unsquashed_values) - log_prob -= tf.math.reduce_sum( - tf.math.log(1 - unsquashed_values_tanhd**2 + SMALL_NUMBER), - axis=-1) - return log_prob - def _log_squash_grad(self, unsquashed_values): unsquashed_values_tanhd = tf.math.tanh(unsquashed_values) return tf.math.log(1 - unsquashed_values_tanhd**2 + SMALL_NUMBER) @@ -401,9 +390,9 @@ def kl(self, other): other_mean = other.distr.loc other_std = other.distr.scale - return (other.log_std - self.log_std + - (tf.square(std) + tf.square(mean - other_mean)) / - (2.0 * tf.square(other_std)) - 0.5) + return tf.reduce_sum((other.log_std - self.log_std + + (tf.square(std) + tf.square(mean - other_mean)) / + (2.0 * tf.square(other_std)) - 0.5), axis=1) def entropy(self): # Entropy is: @@ -413,10 +402,10 @@ def entropy(self): mean = self.distr.loc std = self.distr.scale - return (tf.log(self.high - self.low) - - (tf.log(self._SCALE) - self.log_std + - (tf.square(std) + tf.square(mean)) / - (2.0 * tf.square(self._SCALE)) - 0.5)) + return tf.reduce_sum(tf.log(self.high - self.low) - + (tf.log(self._SCALE) - self.log_std + + (tf.square(std) + tf.square(mean)) / + (2.0 * tf.square(self._SCALE)) - 0.5), axis=1) def _log_squash_grad(self, unsquashed_values): squash_dist = tfp.distributions.Normal(loc=0, scale=self._SCALE) From 9c9b8bce10f2f04a25e1481f581f411569c00569 Mon Sep 17 00:00:00 2001 From: Matthew Earl Date: Thu, 16 Apr 2020 22:36:50 +0100 Subject: [PATCH 11/12] add test for gaussian squashed gaussian --- rllib/models/tests/test_distributions.py | 31 +++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/rllib/models/tests/test_distributions.py b/rllib/models/tests/test_distributions.py index 9586b753275f..605ab39b0de1 100644 --- a/rllib/models/tests/test_distributions.py +++ b/rllib/models/tests/test_distributions.py @@ -4,7 +4,7 @@ import unittest from ray.rllib.models.tf.tf_action_dist import Categorical, MultiCategorical, \ - SquashedGaussian, GumbelSoftmax + GaussianSquashedGaussian, SquashedGaussian, GumbelSoftmax from ray.rllib.models.torch.torch_action_dist import TorchMultiCategorical, \ TorchSquashedGaussian, TorchBeta from ray.rllib.utils import try_import_tf, try_import_torch @@ -185,6 +185,35 @@ def test_squashed_gaussian(self): outs = sess.run(outs) check(outs, log_prob, decimals=4) + def test_gaussian_squashed_gaussian(self): + for fw, sess in framework_iterator(frameworks="tf", session=True): + inputs1 = tf.constant([[-0.5, 0.2, np.log(0.1), np.log(0.5)], + [0.6, 0.8, np.log(0.7), np.log(0.8)], + [-10.0, 1.2, np.log(0.9), np.log(1.0)]]) + + inputs2 = tf.constant([[0.2, 0.3, np.log(0.2), np.log(0.4)], + [0.6, 0.8, np.log(0.7), np.log(0.8)], + [-11.0, 1.2, np.log(0.9), np.log(1.0)]]) + + gsg_dist1 = GaussianSquashedGaussian(inputs1, None) + gsg_dist2 = GaussianSquashedGaussian(inputs2, None) + + # KL, entropy, and logp values have been verified empirically. 
+ check(sess.run(gsg_dist1.kl(gsg_dist2)), + np.array([6.532504, 0., 0.])) + check(sess.run(gsg_dist1.entropy()), + np.array([-0.74827796, 0.7070056, -4.971432])) + x = tf.constant([[-0.3939393939393939]]) + check(sess.run(gsg_dist1.logp(x)), + np.array([0.736003, -3.1547096, -6.5595593])) + + # This is just the squashed distribution means. Verified using + # _unsquash (which was itself verified as part of the logp test). + expected = np.array([[-0.41861248, 0.1745522], + [0.49179232, 0.62231755], + [-0.99906087, 0.81425166]]) + check(sess.run(gsg_dist1.deterministic_sample()), expected) + def test_beta(self): input_space = Box(-2.0, 1.0, shape=(200, 10)) low, high = -1.0, 2.0 From 731afbd60b53e0758a2dde57e08ea738626dfe39 Mon Sep 17 00:00:00 2001 From: Matthew Earl Date: Fri, 17 Apr 2020 10:05:32 +0100 Subject: [PATCH 12/12] linter fixes --- rllib/models/catalog.py | 3 ++- rllib/models/tf/tf_action_dist.py | 7 ++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/rllib/models/catalog.py b/rllib/models/catalog.py index 7b0ff999ed03..79b715b536c0 100644 --- a/rllib/models/catalog.py +++ b/rllib/models/catalog.py @@ -13,7 +13,8 @@ from ray.rllib.models.tf.lstm_v1 import LSTM from ray.rllib.models.tf.modelv1_compat import make_v1_wrapper from ray.rllib.models.tf.tf_action_dist import Categorical, MultiCategorical, \ - Deterministic, DiagGaussian, MultiActionDistribution, Dirichlet, GaussianSquashedGaussian + Deterministic, DiagGaussian, MultiActionDistribution, Dirichlet, \ + GaussianSquashedGaussian from ray.rllib.models.tf.tf_modelv2 import TFModelV2 from ray.rllib.models.tf.visionnet_v1 import VisionNetwork from ray.rllib.models.torch.torch_modelv2 import TorchModelV2 diff --git a/rllib/models/tf/tf_action_dist.py b/rllib/models/tf/tf_action_dist.py index 0ecb364cf875..9c819b26922d 100644 --- a/rllib/models/tf/tf_action_dist.py +++ b/rllib/models/tf/tf_action_dist.py @@ -298,7 +298,8 @@ def logp(self, x): assert len(x.shape) >= 2, "First dim batch, second dim variable" unsquashed_values = self._unsquash(x) log_prob = self.distr.log_prob(value=unsquashed_values) - return tf.reduce_sum(log_prob - self._log_squash_grad(unsquashed_values), axis=1) + return tf.reduce_sum(log_prob - + self._log_squash_grad(unsquashed_values), axis=1) @override(TFActionDistribution) def _build_sample_op(self): @@ -308,7 +309,7 @@ def _build_sample_op(self): def _squash(self, unsquashed_values): """Squash an array element-wise into the (high, low) range - + Arguments: unsquashed_values: values to be squashed @@ -320,7 +321,7 @@ def _squash(self, unsquashed_values): def _unsquash(self, values): """Unsquash an array element-wise from the (high, low) range - + Arguments: squashed_values: values to be unsquashed