From 8e63d3c1b5e5f4558594b325211fca1c170a22dd Mon Sep 17 00:00:00 2001 From: Matthew Earl Date: Fri, 13 Mar 2020 09:29:47 +0000 Subject: [PATCH 01/12] Implement GaussianSquashedGaussian. Still buggy --- rllib/models/tf/tf_action_dist.py | 117 +++++++++++++++++++++++++----- 1 file changed, 98 insertions(+), 19 deletions(-) diff --git a/rllib/models/tf/tf_action_dist.py b/rllib/models/tf/tf_action_dist.py index 2bac4f4bc52e..8fb4d9ce8df2 100644 --- a/rllib/models/tf/tf_action_dist.py +++ b/rllib/models/tf/tf_action_dist.py @@ -190,13 +190,7 @@ def required_model_output_shape(action_space, model_config): return np.prod(action_space.shape) * 2 -class SquashedGaussian(TFActionDistribution): - """A tanh-squashed Gaussian distribution defined by: mean, std, low, high. - - The distribution will never return low or high exactly, but - `low`+SMALL_NUMBER or `high`-SMALL_NUMBER respectively. - """ - +class _SquashedGaussianBase(TFActionDistribution): def __init__(self, inputs, model, low=-1.0, high=1.0): """Parameterizes the distribution via `inputs`. @@ -209,15 +203,38 @@ def __init__(self, inputs, model, low=-1.0, high=1.0): assert tfp is not None loc, log_scale = tf.split(inputs, 2, axis=-1) # Clip `scale` values (coming from NN) to reasonable values. - log_scale = tf.clip_by_value(log_scale, MIN_LOG_NN_OUTPUT, - MAX_LOG_NN_OUTPUT) - scale = tf.exp(log_scale) + self.log_std = tf.clip_by_value(log_scale, MIN_LOG_NN_OUTPUT, + MAX_LOG_NN_OUTPUT) + scale = tf.exp(self.log_std) self.distr = tfp.distributions.Normal(loc=loc, scale=scale) assert np.all(np.less(low, high)) self.low = low self.high = high super().__init__(inputs, model) + @override(ActionDistribution) + def deterministic_sample(self): + mean = self.distr.mean() + return self._squash(mean) + + @override(TFActionDistribution) + def _build_sample_op(self): + return self._squash(self.distr.sample()) + + def _squash(self, raw_values): + raise NotImplementedError + + def _unsquash(self, values): + raise NotImplementedError + + +class SquashedGaussian(_SquashedGaussianBase): + """A tanh-squashed Gaussian distribution defined by: mean, std, low, high. + + The distribution will never return low or high exactly, but + `low`+SMALL_NUMBER or `high`-SMALL_NUMBER respectively. + """ + @override(TFActionDistribution) def sampled_action_logp(self): unsquashed_values = self._unsquash(self.sample_op) @@ -229,15 +246,6 @@ def sampled_action_logp(self): axis=-1) return log_prob - @override(ActionDistribution) - def deterministic_sample(self): - mean = self.distr.mean() - return self._squash(mean) - - @override(TFActionDistribution) - def _build_sample_op(self): - return self._squash(self.distr.sample()) - @override(ActionDistribution) def logp(self, x): unsquashed_values = self._unsquash(x) @@ -263,6 +271,77 @@ def _unsquash(self, values): (self.high - self.low) * 2.0 - 1.0) +class GaussianSquashedGaussian(_SquashedGaussianBase): + """A gaussian CDF-squashed Gaussian distribution. + + The distribution will never return low or high exactly, but + `low`+SMALL_NUMBER or `high`-SMALL_NUMBER respectively. 
+ """ + # Chosen to match the standard logistic variance, so that: + # Var(N(0, 0.5 * _SCALE)) = Var(Logistic(0, 1)) + _SCALE = 0.5 * 1.8137 + + @override(ActionDistribution) + def logp(self, x): + unsquashed_values = self._unsquash(x) + log_prob = tf.reduce_sum( + self.distr.log_prob(value=unsquashed_values), axis=-1) + u = (unsquashed_values - self.low) / (self.high - self.low) + dist = tfp.distributions.Normal(loc=0, scale=self._SCALE) + log_prob -= tf.math.reduce_sum(dist.log_prob(value=u), axis=-1) + log_prob += tf.log(self.high - self.low) + return log_prob + + @override(ActionDistribution) + def kl(self, other): + # KL(self || other) is just the KL of the two unsquashed distributions. + assert isinstance(other, GaussianSquashedGaussian) + + mean = self.distr.mean() + std = self.distr.std() + + other_mean = other.distr.mean() + other_std = other.distr.std() + + return tf.reduce_sum( + other.log_std - self.log_std + + (tf.square(std) + tf.square(mean - other_mean)) / + (2.0 * tf.square(other_std)) - 0.5, + axis=1) + + def entropy(self): + # Entropy is: + # -KL(self.distr || N(0, _SCALE)) + log(high - low) + # where the latter distribution's CDF is used to do the squashing. + + mean = self.distr.mean() + std = self.distr.std() + + return tf.reduce_sum( + log(self.high - self.low) - + (tf.log(self._SCALE) - self.log_std + + (tf.square(std) + tf.square(mean)) / + (2.0 * tf.square(self._SCALE)) - 0.5)) + + def _squash(self, raw_values): + # Make sure raw_values are not too high/low (such that tanh would + # return exactly 1.0/-1.0, which would lead to +/-inf log-probs). + + values = tfp.bijectors.NormalCDF().forward( + raw_values / self._SCALE + ) + return (tf.clip_by_value(values, + SMALL_NUMBER, + 1.0 - SMALL_NUMBER) * + (self.high - self.low) + self.low) + + def _unsquash(self, values): + return self._SCALE * tfp.bijectors.NormalCDF().inverse( + (values - self.low) / (self.high - self.low) + ) + + + class Deterministic(TFActionDistribution): """Action distribution that returns the input values directly. From 005c52420230013db6de5d2f840c616e18e8be75 Mon Sep 17 00:00:00 2001 From: Matthew Earl Date: Fri, 13 Mar 2020 10:00:24 +0000 Subject: [PATCH 02/12] fix bug in gsg logp --- rllib/models/tf/tf_action_dist.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/rllib/models/tf/tf_action_dist.py b/rllib/models/tf/tf_action_dist.py index 8fb4d9ce8df2..81c1cb5a0769 100644 --- a/rllib/models/tf/tf_action_dist.py +++ b/rllib/models/tf/tf_action_dist.py @@ -286,12 +286,12 @@ def logp(self, x): unsquashed_values = self._unsquash(x) log_prob = tf.reduce_sum( self.distr.log_prob(value=unsquashed_values), axis=-1) - u = (unsquashed_values - self.low) / (self.high - self.low) - dist = tfp.distributions.Normal(loc=0, scale=self._SCALE) - log_prob -= tf.math.reduce_sum(dist.log_prob(value=u), axis=-1) - log_prob += tf.log(self.high - self.low) + squash_dist = tfp.distributions.Normal(loc=0, scale=self._SCALE) + log_prob -= tf.reduce_sum( + squash_dist.log_prob(value=unsquashed_values), axis=-1) + log_prob -= tf.log(self.high - self.low) return log_prob - + @override(ActionDistribution) def kl(self, other): # KL(self || other) is just the KL of the two unsquashed distributions. 
From ba69bb7ceecb4d2ea19e6c1ca9870f3ce4ae0423 Mon Sep 17 00:00:00 2001 From: Matthew Earl Date: Fri, 13 Mar 2020 18:44:11 +0000 Subject: [PATCH 03/12] Fix bugs in KL and entropy methods --- rllib/models/tf/tf_action_dist.py | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/rllib/models/tf/tf_action_dist.py b/rllib/models/tf/tf_action_dist.py index 81c1cb5a0769..faad2a4093b8 100644 --- a/rllib/models/tf/tf_action_dist.py +++ b/rllib/models/tf/tf_action_dist.py @@ -297,31 +297,28 @@ def kl(self, other): # KL(self || other) is just the KL of the two unsquashed distributions. assert isinstance(other, GaussianSquashedGaussian) - mean = self.distr.mean() - std = self.distr.std() + mean = self.distr.loc + std = self.distr.scale - other_mean = other.distr.mean() - other_std = other.distr.std() + other_mean = other.distr.loc + other_std = other.distr.scale - return tf.reduce_sum( - other.log_std - self.log_std + - (tf.square(std) + tf.square(mean - other_mean)) / - (2.0 * tf.square(other_std)) - 0.5, - axis=1) + return (other.log_std - self.log_std + + (tf.square(std) + tf.square(mean - other_mean)) / + (2.0 * tf.square(other_std)) - 0.5) def entropy(self): # Entropy is: # -KL(self.distr || N(0, _SCALE)) + log(high - low) # where the latter distribution's CDF is used to do the squashing. - mean = self.distr.mean() - std = self.distr.std() + mean = self.distr.loc + std = self.distr.scale - return tf.reduce_sum( - log(self.high - self.low) - - (tf.log(self._SCALE) - self.log_std + - (tf.square(std) + tf.square(mean)) / - (2.0 * tf.square(self._SCALE)) - 0.5)) + return (tf.log(self.high - self.low) - + (tf.log(self._SCALE) - self.log_std + + (tf.square(std) + tf.square(mean)) / + (2.0 * tf.square(self._SCALE)) - 0.5)) def _squash(self, raw_values): # Make sure raw_values are not too high/low (such that tanh would From 113fc4ff47a68e46b6409d4c43b61bbd3964e484 Mon Sep 17 00:00:00 2001 From: Matthew Earl Date: Fri, 13 Mar 2020 20:54:56 +0000 Subject: [PATCH 04/12] Initial attempt at integrating GSG into catalog Still some bugs to fix --- rllib/models/catalog.py | 34 +++++++++++++++++++++++--- rllib/models/tf/tf_action_dist.py | 40 ++++++++++++++++--------------- 2 files changed, 52 insertions(+), 22 deletions(-) diff --git a/rllib/models/catalog.py b/rllib/models/catalog.py index 4fd864fde3ae..49c7c8f62506 100644 --- a/rllib/models/catalog.py +++ b/rllib/models/catalog.py @@ -13,7 +13,8 @@ from ray.rllib.models.tf.fcnet_v2 import FullyConnectedNetwork as FCNetV2 from ray.rllib.models.tf.visionnet_v2 import VisionNetwork as VisionNetV2 from ray.rllib.models.tf.tf_action_dist import Categorical, MultiCategorical, \ - Deterministic, DiagGaussian, MultiActionDistribution, Dirichlet + Deterministic, DiagGaussian, MultiActionDistribution, Dirichlet, \ + GaussianSquashedGaussian from ray.rllib.models.preprocessors import get_preprocessor from ray.rllib.models.tf.fcnet_v1 import FullyConnectedNetwork from ray.rllib.models.tf.lstm_v1 import LSTM @@ -104,6 +105,26 @@ class ModelCatalog: >>> action = dist.sample() """ + @staticmethod + def _make_bounded_dist(action_space): + child_dists = [] + + low = np.ravel(action_space.low) + high = np.ravel(action_space.high) + + for l, h in zip(low, high): + if np.isinf(l) and np.isinf(h): + dist = partial(GaussianSquashedGaussian, low=l, high=h) + else: + dist = DiagGaussian + child_dists.append(dist) + + return partial( + MultiActionDistribution, + action_space=action_space, + child_distributions=child_dists, + input_lens=[2] 
* len(child_dists)), 2 * len(child_dists) + @staticmethod @DeveloperAPI def get_action_dist(action_space, @@ -147,9 +168,16 @@ def get_action_dist(action_space, "Consider reshaping this into a single dimension, " "using a custom action distribution, " "using a Tuple action space, or the multi-agent API.") - # TODO(sven): Check for bounds and return SquashedNormal, etc.. if dist_type is None: - dist = DiagGaussian if framework == "tf" else TorchDiagGaussian + any_bounded = np.any(action_space.bounded_below & + action_space.bounded_above) + if framework != "tf": + return TorchDiagGaussian + elif np.any(action_space.bounded_below & + action_space.bounded_above): + return ModelCatalog._make_bounded_dist(action_space) + else: + dist = TorchDiagGaussian elif dist_type == "deterministic": dist = Deterministic # Discrete Space -> Categorical. diff --git a/rllib/models/tf/tf_action_dist.py b/rllib/models/tf/tf_action_dist.py index faad2a4093b8..ecc3b1ed6827 100644 --- a/rllib/models/tf/tf_action_dist.py +++ b/rllib/models/tf/tf_action_dist.py @@ -217,6 +217,13 @@ def deterministic_sample(self): mean = self.distr.mean() return self._squash(mean) + @override(ActionDistribution) + def logp(self, x): + unsquashed_values = self._unsquash(x) + log_prob = tf.reduce_sum( + self.distr.log_prob(value=unsquashed_values), axis=-1) + return log_prob - self._log_squash_grad(unsquashed_values) + @override(TFActionDistribution) def _build_sample_op(self): return self._squash(self.distr.sample()) @@ -227,6 +234,9 @@ def _squash(self, raw_values): def _unsquash(self, values): raise NotImplementedError + def _log_squash_grad(self, unsquashed_values): + raise NotImplementedError + class SquashedGaussian(_SquashedGaussianBase): """A tanh-squashed Gaussian distribution defined by: mean, std, low, high. @@ -246,16 +256,11 @@ def sampled_action_logp(self): axis=-1) return log_prob - @override(ActionDistribution) - def logp(self, x): - unsquashed_values = self._unsquash(x) - log_prob = tf.reduce_sum( - self.distr.log_prob(value=unsquashed_values), axis=-1) + def _log_squash_grad(self, unsquashed_values): unsquashed_values_tanhd = tf.math.tanh(unsquashed_values) - log_prob -= tf.math.reduce_sum( + return tf.math.reduce_sum( tf.math.log(1 - unsquashed_values_tanhd**2 + SMALL_NUMBER), axis=-1) - return log_prob def _squash(self, raw_values): # Make sure raw_values are not too high/low (such that tanh would @@ -266,6 +271,7 @@ def _squash(self, raw_values): 1.0 - SMALL_NUMBER) + 1.0) / 2.0 * (self.high - self.low) + \ self.low + def _unsquash(self, values): return tf.math.atanh((values - self.low) / (self.high - self.low) * 2.0 - 1.0) @@ -278,20 +284,9 @@ class GaussianSquashedGaussian(_SquashedGaussianBase): `low`+SMALL_NUMBER or `high`-SMALL_NUMBER respectively. """ # Chosen to match the standard logistic variance, so that: - # Var(N(0, 0.5 * _SCALE)) = Var(Logistic(0, 1)) + # Var(N(0, 2 * _SCALE)) = Var(Logistic(0, 1)) _SCALE = 0.5 * 1.8137 - @override(ActionDistribution) - def logp(self, x): - unsquashed_values = self._unsquash(x) - log_prob = tf.reduce_sum( - self.distr.log_prob(value=unsquashed_values), axis=-1) - squash_dist = tfp.distributions.Normal(loc=0, scale=self._SCALE) - log_prob -= tf.reduce_sum( - squash_dist.log_prob(value=unsquashed_values), axis=-1) - log_prob -= tf.log(self.high - self.low) - return log_prob - @override(ActionDistribution) def kl(self, other): # KL(self || other) is just the KL of the two unsquashed distributions. 
@@ -320,6 +315,13 @@ def entropy(self): (tf.square(std) + tf.square(mean)) / (2.0 * tf.square(self._SCALE)) - 0.5)) + def _log_squash_grad(self, unsquashed_values): + squash_dist = tfp.distributions.Normal(loc=0, scale=self._SCALE) + log_grad = tf.reduce_sum( + squash_dist.log_prob(value=unsquashed_values), axis=-1) + log_grad += tf.log(self.high - self.low) + return log_grad + def _squash(self, raw_values): # Make sure raw_values are not too high/low (such that tanh would # return exactly 1.0/-1.0, which would lead to +/-inf log-probs). From c8e53ced9bccfc63d5934018562760e53d591be1 Mon Sep 17 00:00:00 2001 From: Matthew Earl Date: Sat, 14 Mar 2020 13:16:08 +0000 Subject: [PATCH 05/12] Fix up the shapes returned by SG --- rllib/models/catalog.py | 5 ++- rllib/models/tf/tf_action_dist.py | 59 ++++++++++++++++++++++++------- 2 files changed, 50 insertions(+), 14 deletions(-) diff --git a/rllib/models/catalog.py b/rllib/models/catalog.py index 49c7c8f62506..8ce1a5d4d97f 100644 --- a/rllib/models/catalog.py +++ b/rllib/models/catalog.py @@ -113,12 +113,15 @@ def _make_bounded_dist(action_space): high = np.ravel(action_space.high) for l, h in zip(low, high): - if np.isinf(l) and np.isinf(h): + if not np.isinf(l) and not np.isinf(h): dist = partial(GaussianSquashedGaussian, low=l, high=h) else: dist = DiagGaussian child_dists.append(dist) + if len(child_dists) == 1: + return dist, 2 + return partial( MultiActionDistribution, action_space=action_space, diff --git a/rllib/models/tf/tf_action_dist.py b/rllib/models/tf/tf_action_dist.py index ecc3b1ed6827..bdabf27efb06 100644 --- a/rllib/models/tf/tf_action_dist.py +++ b/rllib/models/tf/tf_action_dist.py @@ -191,6 +191,8 @@ def required_model_output_shape(action_space, model_config): class _SquashedGaussianBase(TFActionDistribution): + """A univariate gaussian distribution, squashed into bounded support.""" + def __init__(self, inputs, model, low=-1.0, high=1.0): """Parameterizes the distribution via `inputs`. @@ -201,12 +203,14 @@ def __init__(self, inputs, model, low=-1.0, high=1.0): (excluding this value). """ assert tfp is not None - loc, log_scale = tf.split(inputs, 2, axis=-1) + loc, log_scale = inputs[:, 0], inputs[:, 1] # Clip `scale` values (coming from NN) to reasonable values. 
self.log_std = tf.clip_by_value(log_scale, MIN_LOG_NN_OUTPUT, MAX_LOG_NN_OUTPUT) scale = tf.exp(self.log_std) self.distr = tfp.distributions.Normal(loc=loc, scale=scale) + assert len(self.distr.loc.shape) == 1 + assert len(self.distr.scale.shape) == 1 assert np.all(np.less(low, high)) self.low = low self.high = high @@ -215,26 +219,59 @@ def __init__(self, inputs, model, low=-1.0, high=1.0): @override(ActionDistribution) def deterministic_sample(self): mean = self.distr.mean() - return self._squash(mean) + assert len(mean.shape) == 1, "Shape should be batch dim only" + s = self._squash(mean) + assert len(s.shape) == 1 + return s[:, None] @override(ActionDistribution) def logp(self, x): - unsquashed_values = self._unsquash(x) - log_prob = tf.reduce_sum( - self.distr.log_prob(value=unsquashed_values), axis=-1) + assert len(x.shape) >= 2, "First dim batch, second dim variable" + unsquashed_values = self._unsquash(x[:, 0]) + log_prob = self.distr.log_prob(value=unsquashed_values) return log_prob - self._log_squash_grad(unsquashed_values) @override(TFActionDistribution) def _build_sample_op(self): - return self._squash(self.distr.sample()) + s = self._squash(self.distr.sample()) + assert len(s.shape) == 1 + return s[:, None] - def _squash(self, raw_values): + def _squash(self, unsquashed_values): + """Squash an array element-wise into the (high, low) range + + Arguments: + unsquashed_values: values to be squashed + + Returns: + The squashed values. The output shape is `unsquashed_values.shape` + + """ raise NotImplementedError def _unsquash(self, values): + """Unsquash an array element-wise from the (high, low) range + + Arguments: + squashed_values: values to be unsquashed + + Returns: + The unsquashed values. The output shape is `squashed_values.shape` + + """ raise NotImplementedError def _log_squash_grad(self, unsquashed_values): + """Log gradient of _squash with respect to its argument. + + Arguments: + squashed_values: Point at which to measure the gradient. + + Returns: + The gradient at the given point. The output shape is + `squashed_values.shape`. 
+ + """ raise NotImplementedError @@ -258,9 +295,7 @@ def sampled_action_logp(self): def _log_squash_grad(self, unsquashed_values): unsquashed_values_tanhd = tf.math.tanh(unsquashed_values) - return tf.math.reduce_sum( - tf.math.log(1 - unsquashed_values_tanhd**2 + SMALL_NUMBER), - axis=-1) + return tf.math.log(1 - unsquashed_values_tanhd**2 + SMALL_NUMBER) def _squash(self, raw_values): # Make sure raw_values are not too high/low (such that tanh would @@ -271,7 +306,6 @@ def _squash(self, raw_values): 1.0 - SMALL_NUMBER) + 1.0) / 2.0 * (self.high - self.low) + \ self.low - def _unsquash(self, values): return tf.math.atanh((values - self.low) / (self.high - self.low) * 2.0 - 1.0) @@ -317,8 +351,7 @@ def entropy(self): def _log_squash_grad(self, unsquashed_values): squash_dist = tfp.distributions.Normal(loc=0, scale=self._SCALE) - log_grad = tf.reduce_sum( - squash_dist.log_prob(value=unsquashed_values), axis=-1) + log_grad = squash_dist.log_prob(value=unsquashed_values) log_grad += tf.log(self.high - self.low) return log_grad From f4521f7905d59e057167bc37d815b6e48f38c6e9 Mon Sep 17 00:00:00 2001 From: Matthew Earl Date: Sun, 15 Mar 2020 16:12:15 +0000 Subject: [PATCH 06/12] Reformatting according to scripts/format.sh --- rllib/models/catalog.py | 4 ++-- rllib/models/tf/tf_action_dist.py | 16 +++++----------- 2 files changed, 7 insertions(+), 13 deletions(-) diff --git a/rllib/models/catalog.py b/rllib/models/catalog.py index 8ce1a5d4d97f..910068e3ca23 100644 --- a/rllib/models/catalog.py +++ b/rllib/models/catalog.py @@ -172,8 +172,8 @@ def get_action_dist(action_space, "using a custom action distribution, " "using a Tuple action space, or the multi-agent API.") if dist_type is None: - any_bounded = np.any(action_space.bounded_below & - action_space.bounded_above) + any_bounded = np.any( + action_space.bounded_below & action_space.bounded_above) if framework != "tf": return TorchDiagGaussian elif np.any(action_space.bounded_below & diff --git a/rllib/models/tf/tf_action_dist.py b/rllib/models/tf/tf_action_dist.py index bdabf27efb06..fd597e135131 100644 --- a/rllib/models/tf/tf_action_dist.py +++ b/rllib/models/tf/tf_action_dist.py @@ -346,8 +346,8 @@ def entropy(self): return (tf.log(self.high - self.low) - (tf.log(self._SCALE) - self.log_std + - (tf.square(std) + tf.square(mean)) / - (2.0 * tf.square(self._SCALE)) - 0.5)) + (tf.square(std) + tf.square(mean)) / + (2.0 * tf.square(self._SCALE)) - 0.5)) def _log_squash_grad(self, unsquashed_values): squash_dist = tfp.distributions.Normal(loc=0, scale=self._SCALE) @@ -359,19 +359,13 @@ def _squash(self, raw_values): # Make sure raw_values are not too high/low (such that tanh would # return exactly 1.0/-1.0, which would lead to +/-inf log-probs). 
- values = tfp.bijectors.NormalCDF().forward( - raw_values / self._SCALE - ) - return (tf.clip_by_value(values, - SMALL_NUMBER, - 1.0 - SMALL_NUMBER) * + values = tfp.bijectors.NormalCDF().forward(raw_values / self._SCALE) + return (tf.clip_by_value(values, SMALL_NUMBER, 1.0 - SMALL_NUMBER) * (self.high - self.low) + self.low) def _unsquash(self, values): return self._SCALE * tfp.bijectors.NormalCDF().inverse( - (values - self.low) / (self.high - self.low) - ) - + (values - self.low) / (self.high - self.low)) class Deterministic(TFActionDistribution): From b0c2323a1f87caf803c3582571f232ed7c3a37a3 Mon Sep 17 00:00:00 2001 From: Matthew Earl Date: Tue, 14 Apr 2020 07:48:31 +0100 Subject: [PATCH 07/12] code review markup --- rllib/models/catalog.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/rllib/models/catalog.py b/rllib/models/catalog.py index 910068e3ca23..94d4a79be307 100644 --- a/rllib/models/catalog.py +++ b/rllib/models/catalog.py @@ -172,15 +172,13 @@ def get_action_dist(action_space, "using a custom action distribution, " "using a Tuple action space, or the multi-agent API.") if dist_type is None: - any_bounded = np.any( - action_space.bounded_below & action_space.bounded_above) if framework != "tf": return TorchDiagGaussian elif np.any(action_space.bounded_below & action_space.bounded_above): return ModelCatalog._make_bounded_dist(action_space) else: - dist = TorchDiagGaussian + dist = DiagGaussian elif dist_type == "deterministic": dist = Deterministic # Discrete Space -> Categorical. From 0e161fc2920faf7c1eeba794a6a95bfbca359852 Mon Sep 17 00:00:00 2001 From: Matthew Earl Date: Tue, 14 Apr 2020 11:19:19 +0100 Subject: [PATCH 08/12] Bound loc for numerical stability --- rllib/models/tf/tf_action_dist.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/rllib/models/tf/tf_action_dist.py b/rllib/models/tf/tf_action_dist.py index fd597e135131..a810dd9f730a 100644 --- a/rllib/models/tf/tf_action_dist.py +++ b/rllib/models/tf/tf_action_dist.py @@ -207,6 +207,8 @@ def __init__(self, inputs, model, low=-1.0, high=1.0): # Clip `scale` values (coming from NN) to reasonable values. self.log_std = tf.clip_by_value(log_scale, MIN_LOG_NN_OUTPUT, MAX_LOG_NN_OUTPUT) + # Clip loc too, for numerical stability reasons. + loc = tf.clip_by_value(loc, -3, 3) scale = tf.exp(self.log_std) self.distr = tfp.distributions.Normal(loc=loc, scale=scale) assert len(self.distr.loc.shape) == 1 From f226d2e3df2c711e22c9737a95f40b7a719da761 Mon Sep 17 00:00:00 2001 From: Matthew Earl Date: Thu, 16 Apr 2020 20:10:24 +0100 Subject: [PATCH 09/12] Fix squashed gaussian unit test --- rllib/models/tests/test_distributions.py | 2 +- rllib/models/tf/tf_action_dist.py | 28 +++++++++++++----------- 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/rllib/models/tests/test_distributions.py b/rllib/models/tests/test_distributions.py index ebd3525acd62..9586b753275f 100644 --- a/rllib/models/tests/test_distributions.py +++ b/rllib/models/tests/test_distributions.py @@ -155,7 +155,7 @@ def test_squashed_gaussian(self): check(np.sum(sampled_action_logp), np.sum(log_prob), rtol=0.05) # NN output. 
- means = np.array([[0.1, 0.2, 0.3, 0.4, 50.0], + means = np.array([[0.1, 0.2, 0.3, 0.4, 2.9], [-0.1, -0.2, -0.3, -0.4, -1.0]]) log_stds = np.array([[0.8, -0.2, 0.3, -1.0, 2.0], [0.7, -0.3, 0.4, -0.9, 2.0]]) diff --git a/rllib/models/tf/tf_action_dist.py b/rllib/models/tf/tf_action_dist.py index 32226b97c64e..1975f72788c3 100644 --- a/rllib/models/tf/tf_action_dist.py +++ b/rllib/models/tf/tf_action_dist.py @@ -256,7 +256,7 @@ def required_model_output_shape(action_space, model_config): class _SquashedGaussianBase(TFActionDistribution): - """A univariate gaussian distribution, squashed into bounded support.""" + """A diagonal gaussian distribution, squashed into bounded support.""" def __init__(self, inputs, model, low=-1.0, high=1.0): """Parameterizes the distribution via `inputs`. @@ -268,16 +268,18 @@ def __init__(self, inputs, model, low=-1.0, high=1.0): (excluding this value). """ assert tfp is not None - loc, log_std = inputs[:, 0], inputs[:, 1] + mean, log_std = tf.split(inputs, 2, axis=-1) + self._num_vars = mean.shape[1] + assert log_std.shape[1] == self._num_vars # Clip `std` values (coming from NN) to reasonable values. self.log_std = tf.clip_by_value(log_std, MIN_LOG_NN_OUTPUT, MAX_LOG_NN_OUTPUT) # Clip loc too, for numerical stability reasons. - loc = tf.clip_by_value(loc, -3, 3) + mean = tf.clip_by_value(mean, -3, 3) std = tf.exp(self.log_std) - self.distr = tfp.distributions.Normal(loc=loc, scale=std) - assert len(self.distr.loc.shape) == 1 - assert len(self.distr.scale.shape) == 1 + self.distr = tfp.distributions.Normal(loc=mean, scale=std) + assert len(self.distr.loc.shape) == 2 + assert len(self.distr.scale.shape) == 2 assert np.all(np.less(low, high)) self.low = low self.high = high @@ -286,23 +288,23 @@ def __init__(self, inputs, model, low=-1.0, high=1.0): @override(ActionDistribution) def deterministic_sample(self): mean = self.distr.mean() - assert len(mean.shape) == 1, "Shape should be batch dim only" + assert len(mean.shape) == 2 s = self._squash(mean) - assert len(s.shape) == 1 - return s[:, None] + assert len(s.shape) == 2 + return s @override(ActionDistribution) def logp(self, x): assert len(x.shape) >= 2, "First dim batch, second dim variable" - unsquashed_values = self._unsquash(x[:, 0]) + unsquashed_values = self._unsquash(x) log_prob = self.distr.log_prob(value=unsquashed_values) - return log_prob - self._log_squash_grad(unsquashed_values) + return tf.reduce_sum(log_prob - self._log_squash_grad(unsquashed_values), axis=-1) @override(TFActionDistribution) def _build_sample_op(self): s = self._squash(self.distr.sample()) - assert len(s.shape) == 1 - return s[:, None] + assert len(s.shape) == 2 + return s def _squash(self, unsquashed_values): """Squash an array element-wise into the (high, low) range From 3e1d345347a022b78896a0a69f14edb192921811 Mon Sep 17 00:00:00 2001 From: Matthew Earl Date: Thu, 16 Apr 2020 21:17:14 +0100 Subject: [PATCH 10/12] Fix gaussian squashed gaussian following the previous commit --- rllib/models/tf/tf_action_dist.py | 27 ++++++++------------------- 1 file changed, 8 insertions(+), 19 deletions(-) diff --git a/rllib/models/tf/tf_action_dist.py b/rllib/models/tf/tf_action_dist.py index 1975f72788c3..0ecb364cf875 100644 --- a/rllib/models/tf/tf_action_dist.py +++ b/rllib/models/tf/tf_action_dist.py @@ -298,7 +298,7 @@ def logp(self, x): assert len(x.shape) >= 2, "First dim batch, second dim variable" unsquashed_values = self._unsquash(x) log_prob = self.distr.log_prob(value=unsquashed_values) - return tf.reduce_sum(log_prob - 
self._log_squash_grad(unsquashed_values), axis=-1) + return tf.reduce_sum(log_prob - self._log_squash_grad(unsquashed_values), axis=1) @override(TFActionDistribution) def _build_sample_op(self): @@ -351,17 +351,6 @@ class SquashedGaussian(_SquashedGaussianBase): `low`+SMALL_NUMBER or `high`-SMALL_NUMBER respectively. """ - @override(TFActionDistribution) - def sampled_action_logp(self): - unsquashed_values = self._unsquash(self.sample_op) - log_prob = tf.reduce_sum( - self.distr.log_prob(unsquashed_values), axis=-1) - unsquashed_values_tanhd = tf.math.tanh(unsquashed_values) - log_prob -= tf.math.reduce_sum( - tf.math.log(1 - unsquashed_values_tanhd**2 + SMALL_NUMBER), - axis=-1) - return log_prob - def _log_squash_grad(self, unsquashed_values): unsquashed_values_tanhd = tf.math.tanh(unsquashed_values) return tf.math.log(1 - unsquashed_values_tanhd**2 + SMALL_NUMBER) @@ -401,9 +390,9 @@ def kl(self, other): other_mean = other.distr.loc other_std = other.distr.scale - return (other.log_std - self.log_std + - (tf.square(std) + tf.square(mean - other_mean)) / - (2.0 * tf.square(other_std)) - 0.5) + return tf.reduce_sum((other.log_std - self.log_std + + (tf.square(std) + tf.square(mean - other_mean)) / + (2.0 * tf.square(other_std)) - 0.5), axis=1) def entropy(self): # Entropy is: @@ -413,10 +402,10 @@ def entropy(self): mean = self.distr.loc std = self.distr.scale - return (tf.log(self.high - self.low) - - (tf.log(self._SCALE) - self.log_std + - (tf.square(std) + tf.square(mean)) / - (2.0 * tf.square(self._SCALE)) - 0.5)) + return tf.reduce_sum(tf.log(self.high - self.low) - + (tf.log(self._SCALE) - self.log_std + + (tf.square(std) + tf.square(mean)) / + (2.0 * tf.square(self._SCALE)) - 0.5), axis=1) def _log_squash_grad(self, unsquashed_values): squash_dist = tfp.distributions.Normal(loc=0, scale=self._SCALE) From 9c9b8bce10f2f04a25e1481f581f411569c00569 Mon Sep 17 00:00:00 2001 From: Matthew Earl Date: Thu, 16 Apr 2020 22:36:50 +0100 Subject: [PATCH 11/12] add test for gaussian squashed gaussian --- rllib/models/tests/test_distributions.py | 31 +++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/rllib/models/tests/test_distributions.py b/rllib/models/tests/test_distributions.py index 9586b753275f..605ab39b0de1 100644 --- a/rllib/models/tests/test_distributions.py +++ b/rllib/models/tests/test_distributions.py @@ -4,7 +4,7 @@ import unittest from ray.rllib.models.tf.tf_action_dist import Categorical, MultiCategorical, \ - SquashedGaussian, GumbelSoftmax + GaussianSquashedGaussian, SquashedGaussian, GumbelSoftmax from ray.rllib.models.torch.torch_action_dist import TorchMultiCategorical, \ TorchSquashedGaussian, TorchBeta from ray.rllib.utils import try_import_tf, try_import_torch @@ -185,6 +185,35 @@ def test_squashed_gaussian(self): outs = sess.run(outs) check(outs, log_prob, decimals=4) + def test_gaussian_squashed_gaussian(self): + for fw, sess in framework_iterator(frameworks="tf", session=True): + inputs1 = tf.constant([[-0.5, 0.2, np.log(0.1), np.log(0.5)], + [0.6, 0.8, np.log(0.7), np.log(0.8)], + [-10.0, 1.2, np.log(0.9), np.log(1.0)]]) + + inputs2 = tf.constant([[0.2, 0.3, np.log(0.2), np.log(0.4)], + [0.6, 0.8, np.log(0.7), np.log(0.8)], + [-11.0, 1.2, np.log(0.9), np.log(1.0)]]) + + gsg_dist1 = GaussianSquashedGaussian(inputs1, None) + gsg_dist2 = GaussianSquashedGaussian(inputs2, None) + + # KL, entropy, and logp values have been verified empirically. 
+ check(sess.run(gsg_dist1.kl(gsg_dist2)), + np.array([6.532504, 0., 0.])) + check(sess.run(gsg_dist1.entropy()), + np.array([-0.74827796, 0.7070056, -4.971432])) + x = tf.constant([[-0.3939393939393939]]) + check(sess.run(gsg_dist1.logp(x)), + np.array([0.736003, -3.1547096, -6.5595593])) + + # This is just the squashed distribution means. Verified using + # _unsquash (which was itself verified as part of the logp test). + expected = np.array([[-0.41861248, 0.1745522], + [0.49179232, 0.62231755], + [-0.99906087, 0.81425166]]) + check(sess.run(gsg_dist1.deterministic_sample()), expected) + def test_beta(self): input_space = Box(-2.0, 1.0, shape=(200, 10)) low, high = -1.0, 2.0 From 731afbd60b53e0758a2dde57e08ea738626dfe39 Mon Sep 17 00:00:00 2001 From: Matthew Earl Date: Fri, 17 Apr 2020 10:05:32 +0100 Subject: [PATCH 12/12] linter fixes --- rllib/models/catalog.py | 3 ++- rllib/models/tf/tf_action_dist.py | 7 ++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/rllib/models/catalog.py b/rllib/models/catalog.py index 7b0ff999ed03..79b715b536c0 100644 --- a/rllib/models/catalog.py +++ b/rllib/models/catalog.py @@ -13,7 +13,8 @@ from ray.rllib.models.tf.lstm_v1 import LSTM from ray.rllib.models.tf.modelv1_compat import make_v1_wrapper from ray.rllib.models.tf.tf_action_dist import Categorical, MultiCategorical, \ - Deterministic, DiagGaussian, MultiActionDistribution, Dirichlet, GaussianSquashedGaussian + Deterministic, DiagGaussian, MultiActionDistribution, Dirichlet, \ + GaussianSquashedGaussian from ray.rllib.models.tf.tf_modelv2 import TFModelV2 from ray.rllib.models.tf.visionnet_v1 import VisionNetwork from ray.rllib.models.torch.torch_modelv2 import TorchModelV2 diff --git a/rllib/models/tf/tf_action_dist.py b/rllib/models/tf/tf_action_dist.py index 0ecb364cf875..9c819b26922d 100644 --- a/rllib/models/tf/tf_action_dist.py +++ b/rllib/models/tf/tf_action_dist.py @@ -298,7 +298,8 @@ def logp(self, x): assert len(x.shape) >= 2, "First dim batch, second dim variable" unsquashed_values = self._unsquash(x) log_prob = self.distr.log_prob(value=unsquashed_values) - return tf.reduce_sum(log_prob - self._log_squash_grad(unsquashed_values), axis=1) + return tf.reduce_sum(log_prob - + self._log_squash_grad(unsquashed_values), axis=1) @override(TFActionDistribution) def _build_sample_op(self): @@ -308,7 +309,7 @@ def _build_sample_op(self): def _squash(self, unsquashed_values): """Squash an array element-wise into the (high, low) range - + Arguments: unsquashed_values: values to be squashed @@ -320,7 +321,7 @@ def _squash(self, unsquashed_values): def _unsquash(self, values): """Unsquash an array element-wise from the (high, low) range - + Arguments: squashed_values: values to be unsquashed