From 8e63d3c1b5e5f4558594b325211fca1c170a22dd Mon Sep 17 00:00:00 2001 From: Matthew Earl Date: Fri, 13 Mar 2020 09:29:47 +0000 Subject: [PATCH 01/21] Implement GaussianSquashedGaussian. Still buggy --- rllib/models/tf/tf_action_dist.py | 117 +++++++++++++++++++++++++----- 1 file changed, 98 insertions(+), 19 deletions(-) diff --git a/rllib/models/tf/tf_action_dist.py b/rllib/models/tf/tf_action_dist.py index 2bac4f4bc52e..8fb4d9ce8df2 100644 --- a/rllib/models/tf/tf_action_dist.py +++ b/rllib/models/tf/tf_action_dist.py @@ -190,13 +190,7 @@ def required_model_output_shape(action_space, model_config): return np.prod(action_space.shape) * 2 -class SquashedGaussian(TFActionDistribution): - """A tanh-squashed Gaussian distribution defined by: mean, std, low, high. - - The distribution will never return low or high exactly, but - `low`+SMALL_NUMBER or `high`-SMALL_NUMBER respectively. - """ - +class _SquashedGaussianBase(TFActionDistribution): def __init__(self, inputs, model, low=-1.0, high=1.0): """Parameterizes the distribution via `inputs`. @@ -209,15 +203,38 @@ def __init__(self, inputs, model, low=-1.0, high=1.0): assert tfp is not None loc, log_scale = tf.split(inputs, 2, axis=-1) # Clip `scale` values (coming from NN) to reasonable values. - log_scale = tf.clip_by_value(log_scale, MIN_LOG_NN_OUTPUT, - MAX_LOG_NN_OUTPUT) - scale = tf.exp(log_scale) + self.log_std = tf.clip_by_value(log_scale, MIN_LOG_NN_OUTPUT, + MAX_LOG_NN_OUTPUT) + scale = tf.exp(self.log_std) self.distr = tfp.distributions.Normal(loc=loc, scale=scale) assert np.all(np.less(low, high)) self.low = low self.high = high super().__init__(inputs, model) + @override(ActionDistribution) + def deterministic_sample(self): + mean = self.distr.mean() + return self._squash(mean) + + @override(TFActionDistribution) + def _build_sample_op(self): + return self._squash(self.distr.sample()) + + def _squash(self, raw_values): + raise NotImplementedError + + def _unsquash(self, values): + raise NotImplementedError + + +class SquashedGaussian(_SquashedGaussianBase): + """A tanh-squashed Gaussian distribution defined by: mean, std, low, high. + + The distribution will never return low or high exactly, but + `low`+SMALL_NUMBER or `high`-SMALL_NUMBER respectively. + """ + @override(TFActionDistribution) def sampled_action_logp(self): unsquashed_values = self._unsquash(self.sample_op) @@ -229,15 +246,6 @@ def sampled_action_logp(self): axis=-1) return log_prob - @override(ActionDistribution) - def deterministic_sample(self): - mean = self.distr.mean() - return self._squash(mean) - - @override(TFActionDistribution) - def _build_sample_op(self): - return self._squash(self.distr.sample()) - @override(ActionDistribution) def logp(self, x): unsquashed_values = self._unsquash(x) @@ -263,6 +271,77 @@ def _unsquash(self, values): (self.high - self.low) * 2.0 - 1.0) +class GaussianSquashedGaussian(_SquashedGaussianBase): + """A gaussian CDF-squashed Gaussian distribution. + + The distribution will never return low or high exactly, but + `low`+SMALL_NUMBER or `high`-SMALL_NUMBER respectively. + """ + # Chosen to match the standard logistic variance, so that: + # Var(N(0, 0.5 * _SCALE)) = Var(Logistic(0, 1)) + _SCALE = 0.5 * 1.8137 + + @override(ActionDistribution) + def logp(self, x): + unsquashed_values = self._unsquash(x) + log_prob = tf.reduce_sum( + self.distr.log_prob(value=unsquashed_values), axis=-1) + u = (unsquashed_values - self.low) / (self.high - self.low) + dist = tfp.distributions.Normal(loc=0, scale=self._SCALE) + log_prob -= tf.math.reduce_sum(dist.log_prob(value=u), axis=-1) + log_prob += tf.log(self.high - self.low) + return log_prob + + @override(ActionDistribution) + def kl(self, other): + # KL(self || other) is just the KL of the two unsquashed distributions. + assert isinstance(other, GaussianSquashedGaussian) + + mean = self.distr.mean() + std = self.distr.std() + + other_mean = other.distr.mean() + other_std = other.distr.std() + + return tf.reduce_sum( + other.log_std - self.log_std + + (tf.square(std) + tf.square(mean - other_mean)) / + (2.0 * tf.square(other_std)) - 0.5, + axis=1) + + def entropy(self): + # Entropy is: + # -KL(self.distr || N(0, _SCALE)) + log(high - low) + # where the latter distribution's CDF is used to do the squashing. + + mean = self.distr.mean() + std = self.distr.std() + + return tf.reduce_sum( + log(self.high - self.low) - + (tf.log(self._SCALE) - self.log_std + + (tf.square(std) + tf.square(mean)) / + (2.0 * tf.square(self._SCALE)) - 0.5)) + + def _squash(self, raw_values): + # Make sure raw_values are not too high/low (such that tanh would + # return exactly 1.0/-1.0, which would lead to +/-inf log-probs). + + values = tfp.bijectors.NormalCDF().forward( + raw_values / self._SCALE + ) + return (tf.clip_by_value(values, + SMALL_NUMBER, + 1.0 - SMALL_NUMBER) * + (self.high - self.low) + self.low) + + def _unsquash(self, values): + return self._SCALE * tfp.bijectors.NormalCDF().inverse( + (values - self.low) / (self.high - self.low) + ) + + + class Deterministic(TFActionDistribution): """Action distribution that returns the input values directly. From 005c52420230013db6de5d2f840c616e18e8be75 Mon Sep 17 00:00:00 2001 From: Matthew Earl Date: Fri, 13 Mar 2020 10:00:24 +0000 Subject: [PATCH 02/21] fix bug in gsg logp --- rllib/models/tf/tf_action_dist.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/rllib/models/tf/tf_action_dist.py b/rllib/models/tf/tf_action_dist.py index 8fb4d9ce8df2..81c1cb5a0769 100644 --- a/rllib/models/tf/tf_action_dist.py +++ b/rllib/models/tf/tf_action_dist.py @@ -286,12 +286,12 @@ def logp(self, x): unsquashed_values = self._unsquash(x) log_prob = tf.reduce_sum( self.distr.log_prob(value=unsquashed_values), axis=-1) - u = (unsquashed_values - self.low) / (self.high - self.low) - dist = tfp.distributions.Normal(loc=0, scale=self._SCALE) - log_prob -= tf.math.reduce_sum(dist.log_prob(value=u), axis=-1) - log_prob += tf.log(self.high - self.low) + squash_dist = tfp.distributions.Normal(loc=0, scale=self._SCALE) + log_prob -= tf.reduce_sum( + squash_dist.log_prob(value=unsquashed_values), axis=-1) + log_prob -= tf.log(self.high - self.low) return log_prob - + @override(ActionDistribution) def kl(self, other): # KL(self || other) is just the KL of the two unsquashed distributions. From ba69bb7ceecb4d2ea19e6c1ca9870f3ce4ae0423 Mon Sep 17 00:00:00 2001 From: Matthew Earl Date: Fri, 13 Mar 2020 18:44:11 +0000 Subject: [PATCH 03/21] Fix bugs in KL and entropy methods --- rllib/models/tf/tf_action_dist.py | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/rllib/models/tf/tf_action_dist.py b/rllib/models/tf/tf_action_dist.py index 81c1cb5a0769..faad2a4093b8 100644 --- a/rllib/models/tf/tf_action_dist.py +++ b/rllib/models/tf/tf_action_dist.py @@ -297,31 +297,28 @@ def kl(self, other): # KL(self || other) is just the KL of the two unsquashed distributions. assert isinstance(other, GaussianSquashedGaussian) - mean = self.distr.mean() - std = self.distr.std() + mean = self.distr.loc + std = self.distr.scale - other_mean = other.distr.mean() - other_std = other.distr.std() + other_mean = other.distr.loc + other_std = other.distr.scale - return tf.reduce_sum( - other.log_std - self.log_std + - (tf.square(std) + tf.square(mean - other_mean)) / - (2.0 * tf.square(other_std)) - 0.5, - axis=1) + return (other.log_std - self.log_std + + (tf.square(std) + tf.square(mean - other_mean)) / + (2.0 * tf.square(other_std)) - 0.5) def entropy(self): # Entropy is: # -KL(self.distr || N(0, _SCALE)) + log(high - low) # where the latter distribution's CDF is used to do the squashing. - mean = self.distr.mean() - std = self.distr.std() + mean = self.distr.loc + std = self.distr.scale - return tf.reduce_sum( - log(self.high - self.low) - - (tf.log(self._SCALE) - self.log_std + - (tf.square(std) + tf.square(mean)) / - (2.0 * tf.square(self._SCALE)) - 0.5)) + return (tf.log(self.high - self.low) - + (tf.log(self._SCALE) - self.log_std + + (tf.square(std) + tf.square(mean)) / + (2.0 * tf.square(self._SCALE)) - 0.5)) def _squash(self, raw_values): # Make sure raw_values are not too high/low (such that tanh would From 113fc4ff47a68e46b6409d4c43b61bbd3964e484 Mon Sep 17 00:00:00 2001 From: Matthew Earl Date: Fri, 13 Mar 2020 20:54:56 +0000 Subject: [PATCH 04/21] Initial attempt at integrating GSG into catalog Still some bugs to fix --- rllib/models/catalog.py | 34 +++++++++++++++++++++++--- rllib/models/tf/tf_action_dist.py | 40 ++++++++++++++++--------------- 2 files changed, 52 insertions(+), 22 deletions(-) diff --git a/rllib/models/catalog.py b/rllib/models/catalog.py index 4fd864fde3ae..49c7c8f62506 100644 --- a/rllib/models/catalog.py +++ b/rllib/models/catalog.py @@ -13,7 +13,8 @@ from ray.rllib.models.tf.fcnet_v2 import FullyConnectedNetwork as FCNetV2 from ray.rllib.models.tf.visionnet_v2 import VisionNetwork as VisionNetV2 from ray.rllib.models.tf.tf_action_dist import Categorical, MultiCategorical, \ - Deterministic, DiagGaussian, MultiActionDistribution, Dirichlet + Deterministic, DiagGaussian, MultiActionDistribution, Dirichlet, \ + GaussianSquashedGaussian from ray.rllib.models.preprocessors import get_preprocessor from ray.rllib.models.tf.fcnet_v1 import FullyConnectedNetwork from ray.rllib.models.tf.lstm_v1 import LSTM @@ -104,6 +105,26 @@ class ModelCatalog: >>> action = dist.sample() """ + @staticmethod + def _make_bounded_dist(action_space): + child_dists = [] + + low = np.ravel(action_space.low) + high = np.ravel(action_space.high) + + for l, h in zip(low, high): + if np.isinf(l) and np.isinf(h): + dist = partial(GaussianSquashedGaussian, low=l, high=h) + else: + dist = DiagGaussian + child_dists.append(dist) + + return partial( + MultiActionDistribution, + action_space=action_space, + child_distributions=child_dists, + input_lens=[2] * len(child_dists)), 2 * len(child_dists) + @staticmethod @DeveloperAPI def get_action_dist(action_space, @@ -147,9 +168,16 @@ def get_action_dist(action_space, "Consider reshaping this into a single dimension, " "using a custom action distribution, " "using a Tuple action space, or the multi-agent API.") - # TODO(sven): Check for bounds and return SquashedNormal, etc.. if dist_type is None: - dist = DiagGaussian if framework == "tf" else TorchDiagGaussian + any_bounded = np.any(action_space.bounded_below & + action_space.bounded_above) + if framework != "tf": + return TorchDiagGaussian + elif np.any(action_space.bounded_below & + action_space.bounded_above): + return ModelCatalog._make_bounded_dist(action_space) + else: + dist = TorchDiagGaussian elif dist_type == "deterministic": dist = Deterministic # Discrete Space -> Categorical. diff --git a/rllib/models/tf/tf_action_dist.py b/rllib/models/tf/tf_action_dist.py index faad2a4093b8..ecc3b1ed6827 100644 --- a/rllib/models/tf/tf_action_dist.py +++ b/rllib/models/tf/tf_action_dist.py @@ -217,6 +217,13 @@ def deterministic_sample(self): mean = self.distr.mean() return self._squash(mean) + @override(ActionDistribution) + def logp(self, x): + unsquashed_values = self._unsquash(x) + log_prob = tf.reduce_sum( + self.distr.log_prob(value=unsquashed_values), axis=-1) + return log_prob - self._log_squash_grad(unsquashed_values) + @override(TFActionDistribution) def _build_sample_op(self): return self._squash(self.distr.sample()) @@ -227,6 +234,9 @@ def _squash(self, raw_values): def _unsquash(self, values): raise NotImplementedError + def _log_squash_grad(self, unsquashed_values): + raise NotImplementedError + class SquashedGaussian(_SquashedGaussianBase): """A tanh-squashed Gaussian distribution defined by: mean, std, low, high. @@ -246,16 +256,11 @@ def sampled_action_logp(self): axis=-1) return log_prob - @override(ActionDistribution) - def logp(self, x): - unsquashed_values = self._unsquash(x) - log_prob = tf.reduce_sum( - self.distr.log_prob(value=unsquashed_values), axis=-1) + def _log_squash_grad(self, unsquashed_values): unsquashed_values_tanhd = tf.math.tanh(unsquashed_values) - log_prob -= tf.math.reduce_sum( + return tf.math.reduce_sum( tf.math.log(1 - unsquashed_values_tanhd**2 + SMALL_NUMBER), axis=-1) - return log_prob def _squash(self, raw_values): # Make sure raw_values are not too high/low (such that tanh would @@ -266,6 +271,7 @@ def _squash(self, raw_values): 1.0 - SMALL_NUMBER) + 1.0) / 2.0 * (self.high - self.low) + \ self.low + def _unsquash(self, values): return tf.math.atanh((values - self.low) / (self.high - self.low) * 2.0 - 1.0) @@ -278,20 +284,9 @@ class GaussianSquashedGaussian(_SquashedGaussianBase): `low`+SMALL_NUMBER or `high`-SMALL_NUMBER respectively. """ # Chosen to match the standard logistic variance, so that: - # Var(N(0, 0.5 * _SCALE)) = Var(Logistic(0, 1)) + # Var(N(0, 2 * _SCALE)) = Var(Logistic(0, 1)) _SCALE = 0.5 * 1.8137 - @override(ActionDistribution) - def logp(self, x): - unsquashed_values = self._unsquash(x) - log_prob = tf.reduce_sum( - self.distr.log_prob(value=unsquashed_values), axis=-1) - squash_dist = tfp.distributions.Normal(loc=0, scale=self._SCALE) - log_prob -= tf.reduce_sum( - squash_dist.log_prob(value=unsquashed_values), axis=-1) - log_prob -= tf.log(self.high - self.low) - return log_prob - @override(ActionDistribution) def kl(self, other): # KL(self || other) is just the KL of the two unsquashed distributions. @@ -320,6 +315,13 @@ def entropy(self): (tf.square(std) + tf.square(mean)) / (2.0 * tf.square(self._SCALE)) - 0.5)) + def _log_squash_grad(self, unsquashed_values): + squash_dist = tfp.distributions.Normal(loc=0, scale=self._SCALE) + log_grad = tf.reduce_sum( + squash_dist.log_prob(value=unsquashed_values), axis=-1) + log_grad += tf.log(self.high - self.low) + return log_grad + def _squash(self, raw_values): # Make sure raw_values are not too high/low (such that tanh would # return exactly 1.0/-1.0, which would lead to +/-inf log-probs). From c8e53ced9bccfc63d5934018562760e53d591be1 Mon Sep 17 00:00:00 2001 From: Matthew Earl Date: Sat, 14 Mar 2020 13:16:08 +0000 Subject: [PATCH 05/21] Fix up the shapes returned by SG --- rllib/models/catalog.py | 5 ++- rllib/models/tf/tf_action_dist.py | 59 ++++++++++++++++++++++++------- 2 files changed, 50 insertions(+), 14 deletions(-) diff --git a/rllib/models/catalog.py b/rllib/models/catalog.py index 49c7c8f62506..8ce1a5d4d97f 100644 --- a/rllib/models/catalog.py +++ b/rllib/models/catalog.py @@ -113,12 +113,15 @@ def _make_bounded_dist(action_space): high = np.ravel(action_space.high) for l, h in zip(low, high): - if np.isinf(l) and np.isinf(h): + if not np.isinf(l) and not np.isinf(h): dist = partial(GaussianSquashedGaussian, low=l, high=h) else: dist = DiagGaussian child_dists.append(dist) + if len(child_dists) == 1: + return dist, 2 + return partial( MultiActionDistribution, action_space=action_space, diff --git a/rllib/models/tf/tf_action_dist.py b/rllib/models/tf/tf_action_dist.py index ecc3b1ed6827..bdabf27efb06 100644 --- a/rllib/models/tf/tf_action_dist.py +++ b/rllib/models/tf/tf_action_dist.py @@ -191,6 +191,8 @@ def required_model_output_shape(action_space, model_config): class _SquashedGaussianBase(TFActionDistribution): + """A univariate gaussian distribution, squashed into bounded support.""" + def __init__(self, inputs, model, low=-1.0, high=1.0): """Parameterizes the distribution via `inputs`. @@ -201,12 +203,14 @@ def __init__(self, inputs, model, low=-1.0, high=1.0): (excluding this value). """ assert tfp is not None - loc, log_scale = tf.split(inputs, 2, axis=-1) + loc, log_scale = inputs[:, 0], inputs[:, 1] # Clip `scale` values (coming from NN) to reasonable values. self.log_std = tf.clip_by_value(log_scale, MIN_LOG_NN_OUTPUT, MAX_LOG_NN_OUTPUT) scale = tf.exp(self.log_std) self.distr = tfp.distributions.Normal(loc=loc, scale=scale) + assert len(self.distr.loc.shape) == 1 + assert len(self.distr.scale.shape) == 1 assert np.all(np.less(low, high)) self.low = low self.high = high @@ -215,26 +219,59 @@ def __init__(self, inputs, model, low=-1.0, high=1.0): @override(ActionDistribution) def deterministic_sample(self): mean = self.distr.mean() - return self._squash(mean) + assert len(mean.shape) == 1, "Shape should be batch dim only" + s = self._squash(mean) + assert len(s.shape) == 1 + return s[:, None] @override(ActionDistribution) def logp(self, x): - unsquashed_values = self._unsquash(x) - log_prob = tf.reduce_sum( - self.distr.log_prob(value=unsquashed_values), axis=-1) + assert len(x.shape) >= 2, "First dim batch, second dim variable" + unsquashed_values = self._unsquash(x[:, 0]) + log_prob = self.distr.log_prob(value=unsquashed_values) return log_prob - self._log_squash_grad(unsquashed_values) @override(TFActionDistribution) def _build_sample_op(self): - return self._squash(self.distr.sample()) + s = self._squash(self.distr.sample()) + assert len(s.shape) == 1 + return s[:, None] - def _squash(self, raw_values): + def _squash(self, unsquashed_values): + """Squash an array element-wise into the (high, low) range + + Arguments: + unsquashed_values: values to be squashed + + Returns: + The squashed values. The output shape is `unsquashed_values.shape` + + """ raise NotImplementedError def _unsquash(self, values): + """Unsquash an array element-wise from the (high, low) range + + Arguments: + squashed_values: values to be unsquashed + + Returns: + The unsquashed values. The output shape is `squashed_values.shape` + + """ raise NotImplementedError def _log_squash_grad(self, unsquashed_values): + """Log gradient of _squash with respect to its argument. + + Arguments: + squashed_values: Point at which to measure the gradient. + + Returns: + The gradient at the given point. The output shape is + `squashed_values.shape`. + + """ raise NotImplementedError @@ -258,9 +295,7 @@ def sampled_action_logp(self): def _log_squash_grad(self, unsquashed_values): unsquashed_values_tanhd = tf.math.tanh(unsquashed_values) - return tf.math.reduce_sum( - tf.math.log(1 - unsquashed_values_tanhd**2 + SMALL_NUMBER), - axis=-1) + return tf.math.log(1 - unsquashed_values_tanhd**2 + SMALL_NUMBER) def _squash(self, raw_values): # Make sure raw_values are not too high/low (such that tanh would @@ -271,7 +306,6 @@ def _squash(self, raw_values): 1.0 - SMALL_NUMBER) + 1.0) / 2.0 * (self.high - self.low) + \ self.low - def _unsquash(self, values): return tf.math.atanh((values - self.low) / (self.high - self.low) * 2.0 - 1.0) @@ -317,8 +351,7 @@ def entropy(self): def _log_squash_grad(self, unsquashed_values): squash_dist = tfp.distributions.Normal(loc=0, scale=self._SCALE) - log_grad = tf.reduce_sum( - squash_dist.log_prob(value=unsquashed_values), axis=-1) + log_grad = squash_dist.log_prob(value=unsquashed_values) log_grad += tf.log(self.high - self.low) return log_grad From f4521f7905d59e057167bc37d815b6e48f38c6e9 Mon Sep 17 00:00:00 2001 From: Matthew Earl Date: Sun, 15 Mar 2020 16:12:15 +0000 Subject: [PATCH 06/21] Reformatting according to scripts/format.sh --- rllib/models/catalog.py | 4 ++-- rllib/models/tf/tf_action_dist.py | 16 +++++----------- 2 files changed, 7 insertions(+), 13 deletions(-) diff --git a/rllib/models/catalog.py b/rllib/models/catalog.py index 8ce1a5d4d97f..910068e3ca23 100644 --- a/rllib/models/catalog.py +++ b/rllib/models/catalog.py @@ -172,8 +172,8 @@ def get_action_dist(action_space, "using a custom action distribution, " "using a Tuple action space, or the multi-agent API.") if dist_type is None: - any_bounded = np.any(action_space.bounded_below & - action_space.bounded_above) + any_bounded = np.any( + action_space.bounded_below & action_space.bounded_above) if framework != "tf": return TorchDiagGaussian elif np.any(action_space.bounded_below & diff --git a/rllib/models/tf/tf_action_dist.py b/rllib/models/tf/tf_action_dist.py index bdabf27efb06..fd597e135131 100644 --- a/rllib/models/tf/tf_action_dist.py +++ b/rllib/models/tf/tf_action_dist.py @@ -346,8 +346,8 @@ def entropy(self): return (tf.log(self.high - self.low) - (tf.log(self._SCALE) - self.log_std + - (tf.square(std) + tf.square(mean)) / - (2.0 * tf.square(self._SCALE)) - 0.5)) + (tf.square(std) + tf.square(mean)) / + (2.0 * tf.square(self._SCALE)) - 0.5)) def _log_squash_grad(self, unsquashed_values): squash_dist = tfp.distributions.Normal(loc=0, scale=self._SCALE) @@ -359,19 +359,13 @@ def _squash(self, raw_values): # Make sure raw_values are not too high/low (such that tanh would # return exactly 1.0/-1.0, which would lead to +/-inf log-probs). - values = tfp.bijectors.NormalCDF().forward( - raw_values / self._SCALE - ) - return (tf.clip_by_value(values, - SMALL_NUMBER, - 1.0 - SMALL_NUMBER) * + values = tfp.bijectors.NormalCDF().forward(raw_values / self._SCALE) + return (tf.clip_by_value(values, SMALL_NUMBER, 1.0 - SMALL_NUMBER) * (self.high - self.low) + self.low) def _unsquash(self, values): return self._SCALE * tfp.bijectors.NormalCDF().inverse( - (values - self.low) / (self.high - self.low) - ) - + (values - self.low) / (self.high - self.low)) class Deterministic(TFActionDistribution): From b0c2323a1f87caf803c3582571f232ed7c3a37a3 Mon Sep 17 00:00:00 2001 From: Matthew Earl Date: Tue, 14 Apr 2020 07:48:31 +0100 Subject: [PATCH 07/21] code review markup --- rllib/models/catalog.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/rllib/models/catalog.py b/rllib/models/catalog.py index 910068e3ca23..94d4a79be307 100644 --- a/rllib/models/catalog.py +++ b/rllib/models/catalog.py @@ -172,15 +172,13 @@ def get_action_dist(action_space, "using a custom action distribution, " "using a Tuple action space, or the multi-agent API.") if dist_type is None: - any_bounded = np.any( - action_space.bounded_below & action_space.bounded_above) if framework != "tf": return TorchDiagGaussian elif np.any(action_space.bounded_below & action_space.bounded_above): return ModelCatalog._make_bounded_dist(action_space) else: - dist = TorchDiagGaussian + dist = DiagGaussian elif dist_type == "deterministic": dist = Deterministic # Discrete Space -> Categorical. From 0e161fc2920faf7c1eeba794a6a95bfbca359852 Mon Sep 17 00:00:00 2001 From: Matthew Earl Date: Tue, 14 Apr 2020 11:19:19 +0100 Subject: [PATCH 08/21] Bound loc for numerical stability --- rllib/models/tf/tf_action_dist.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/rllib/models/tf/tf_action_dist.py b/rllib/models/tf/tf_action_dist.py index fd597e135131..a810dd9f730a 100644 --- a/rllib/models/tf/tf_action_dist.py +++ b/rllib/models/tf/tf_action_dist.py @@ -207,6 +207,8 @@ def __init__(self, inputs, model, low=-1.0, high=1.0): # Clip `scale` values (coming from NN) to reasonable values. self.log_std = tf.clip_by_value(log_scale, MIN_LOG_NN_OUTPUT, MAX_LOG_NN_OUTPUT) + # Clip loc too, for numerical stability reasons. + loc = tf.clip_by_value(loc, -3, 3) scale = tf.exp(self.log_std) self.distr = tfp.distributions.Normal(loc=loc, scale=scale) assert len(self.distr.loc.shape) == 1 From f226d2e3df2c711e22c9737a95f40b7a719da761 Mon Sep 17 00:00:00 2001 From: Matthew Earl Date: Thu, 16 Apr 2020 20:10:24 +0100 Subject: [PATCH 09/21] Fix squashed gaussian unit test --- rllib/models/tests/test_distributions.py | 2 +- rllib/models/tf/tf_action_dist.py | 28 +++++++++++++----------- 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/rllib/models/tests/test_distributions.py b/rllib/models/tests/test_distributions.py index ebd3525acd62..9586b753275f 100644 --- a/rllib/models/tests/test_distributions.py +++ b/rllib/models/tests/test_distributions.py @@ -155,7 +155,7 @@ def test_squashed_gaussian(self): check(np.sum(sampled_action_logp), np.sum(log_prob), rtol=0.05) # NN output. - means = np.array([[0.1, 0.2, 0.3, 0.4, 50.0], + means = np.array([[0.1, 0.2, 0.3, 0.4, 2.9], [-0.1, -0.2, -0.3, -0.4, -1.0]]) log_stds = np.array([[0.8, -0.2, 0.3, -1.0, 2.0], [0.7, -0.3, 0.4, -0.9, 2.0]]) diff --git a/rllib/models/tf/tf_action_dist.py b/rllib/models/tf/tf_action_dist.py index 32226b97c64e..1975f72788c3 100644 --- a/rllib/models/tf/tf_action_dist.py +++ b/rllib/models/tf/tf_action_dist.py @@ -256,7 +256,7 @@ def required_model_output_shape(action_space, model_config): class _SquashedGaussianBase(TFActionDistribution): - """A univariate gaussian distribution, squashed into bounded support.""" + """A diagonal gaussian distribution, squashed into bounded support.""" def __init__(self, inputs, model, low=-1.0, high=1.0): """Parameterizes the distribution via `inputs`. @@ -268,16 +268,18 @@ def __init__(self, inputs, model, low=-1.0, high=1.0): (excluding this value). """ assert tfp is not None - loc, log_std = inputs[:, 0], inputs[:, 1] + mean, log_std = tf.split(inputs, 2, axis=-1) + self._num_vars = mean.shape[1] + assert log_std.shape[1] == self._num_vars # Clip `std` values (coming from NN) to reasonable values. self.log_std = tf.clip_by_value(log_std, MIN_LOG_NN_OUTPUT, MAX_LOG_NN_OUTPUT) # Clip loc too, for numerical stability reasons. - loc = tf.clip_by_value(loc, -3, 3) + mean = tf.clip_by_value(mean, -3, 3) std = tf.exp(self.log_std) - self.distr = tfp.distributions.Normal(loc=loc, scale=std) - assert len(self.distr.loc.shape) == 1 - assert len(self.distr.scale.shape) == 1 + self.distr = tfp.distributions.Normal(loc=mean, scale=std) + assert len(self.distr.loc.shape) == 2 + assert len(self.distr.scale.shape) == 2 assert np.all(np.less(low, high)) self.low = low self.high = high @@ -286,23 +288,23 @@ def __init__(self, inputs, model, low=-1.0, high=1.0): @override(ActionDistribution) def deterministic_sample(self): mean = self.distr.mean() - assert len(mean.shape) == 1, "Shape should be batch dim only" + assert len(mean.shape) == 2 s = self._squash(mean) - assert len(s.shape) == 1 - return s[:, None] + assert len(s.shape) == 2 + return s @override(ActionDistribution) def logp(self, x): assert len(x.shape) >= 2, "First dim batch, second dim variable" - unsquashed_values = self._unsquash(x[:, 0]) + unsquashed_values = self._unsquash(x) log_prob = self.distr.log_prob(value=unsquashed_values) - return log_prob - self._log_squash_grad(unsquashed_values) + return tf.reduce_sum(log_prob - self._log_squash_grad(unsquashed_values), axis=-1) @override(TFActionDistribution) def _build_sample_op(self): s = self._squash(self.distr.sample()) - assert len(s.shape) == 1 - return s[:, None] + assert len(s.shape) == 2 + return s def _squash(self, unsquashed_values): """Squash an array element-wise into the (high, low) range From 3e1d345347a022b78896a0a69f14edb192921811 Mon Sep 17 00:00:00 2001 From: Matthew Earl Date: Thu, 16 Apr 2020 21:17:14 +0100 Subject: [PATCH 10/21] Fix gaussian squashed gaussian following the previous commit --- rllib/models/tf/tf_action_dist.py | 27 ++++++++------------------- 1 file changed, 8 insertions(+), 19 deletions(-) diff --git a/rllib/models/tf/tf_action_dist.py b/rllib/models/tf/tf_action_dist.py index 1975f72788c3..0ecb364cf875 100644 --- a/rllib/models/tf/tf_action_dist.py +++ b/rllib/models/tf/tf_action_dist.py @@ -298,7 +298,7 @@ def logp(self, x): assert len(x.shape) >= 2, "First dim batch, second dim variable" unsquashed_values = self._unsquash(x) log_prob = self.distr.log_prob(value=unsquashed_values) - return tf.reduce_sum(log_prob - self._log_squash_grad(unsquashed_values), axis=-1) + return tf.reduce_sum(log_prob - self._log_squash_grad(unsquashed_values), axis=1) @override(TFActionDistribution) def _build_sample_op(self): @@ -351,17 +351,6 @@ class SquashedGaussian(_SquashedGaussianBase): `low`+SMALL_NUMBER or `high`-SMALL_NUMBER respectively. """ - @override(TFActionDistribution) - def sampled_action_logp(self): - unsquashed_values = self._unsquash(self.sample_op) - log_prob = tf.reduce_sum( - self.distr.log_prob(unsquashed_values), axis=-1) - unsquashed_values_tanhd = tf.math.tanh(unsquashed_values) - log_prob -= tf.math.reduce_sum( - tf.math.log(1 - unsquashed_values_tanhd**2 + SMALL_NUMBER), - axis=-1) - return log_prob - def _log_squash_grad(self, unsquashed_values): unsquashed_values_tanhd = tf.math.tanh(unsquashed_values) return tf.math.log(1 - unsquashed_values_tanhd**2 + SMALL_NUMBER) @@ -401,9 +390,9 @@ def kl(self, other): other_mean = other.distr.loc other_std = other.distr.scale - return (other.log_std - self.log_std + - (tf.square(std) + tf.square(mean - other_mean)) / - (2.0 * tf.square(other_std)) - 0.5) + return tf.reduce_sum((other.log_std - self.log_std + + (tf.square(std) + tf.square(mean - other_mean)) / + (2.0 * tf.square(other_std)) - 0.5), axis=1) def entropy(self): # Entropy is: @@ -413,10 +402,10 @@ def entropy(self): mean = self.distr.loc std = self.distr.scale - return (tf.log(self.high - self.low) - - (tf.log(self._SCALE) - self.log_std + - (tf.square(std) + tf.square(mean)) / - (2.0 * tf.square(self._SCALE)) - 0.5)) + return tf.reduce_sum(tf.log(self.high - self.low) - + (tf.log(self._SCALE) - self.log_std + + (tf.square(std) + tf.square(mean)) / + (2.0 * tf.square(self._SCALE)) - 0.5), axis=1) def _log_squash_grad(self, unsquashed_values): squash_dist = tfp.distributions.Normal(loc=0, scale=self._SCALE) From 9c9b8bce10f2f04a25e1481f581f411569c00569 Mon Sep 17 00:00:00 2001 From: Matthew Earl Date: Thu, 16 Apr 2020 22:36:50 +0100 Subject: [PATCH 11/21] add test for gaussian squashed gaussian --- rllib/models/tests/test_distributions.py | 31 +++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/rllib/models/tests/test_distributions.py b/rllib/models/tests/test_distributions.py index 9586b753275f..605ab39b0de1 100644 --- a/rllib/models/tests/test_distributions.py +++ b/rllib/models/tests/test_distributions.py @@ -4,7 +4,7 @@ import unittest from ray.rllib.models.tf.tf_action_dist import Categorical, MultiCategorical, \ - SquashedGaussian, GumbelSoftmax + GaussianSquashedGaussian, SquashedGaussian, GumbelSoftmax from ray.rllib.models.torch.torch_action_dist import TorchMultiCategorical, \ TorchSquashedGaussian, TorchBeta from ray.rllib.utils import try_import_tf, try_import_torch @@ -185,6 +185,35 @@ def test_squashed_gaussian(self): outs = sess.run(outs) check(outs, log_prob, decimals=4) + def test_gaussian_squashed_gaussian(self): + for fw, sess in framework_iterator(frameworks="tf", session=True): + inputs1 = tf.constant([[-0.5, 0.2, np.log(0.1), np.log(0.5)], + [0.6, 0.8, np.log(0.7), np.log(0.8)], + [-10.0, 1.2, np.log(0.9), np.log(1.0)]]) + + inputs2 = tf.constant([[0.2, 0.3, np.log(0.2), np.log(0.4)], + [0.6, 0.8, np.log(0.7), np.log(0.8)], + [-11.0, 1.2, np.log(0.9), np.log(1.0)]]) + + gsg_dist1 = GaussianSquashedGaussian(inputs1, None) + gsg_dist2 = GaussianSquashedGaussian(inputs2, None) + + # KL, entropy, and logp values have been verified empirically. + check(sess.run(gsg_dist1.kl(gsg_dist2)), + np.array([6.532504, 0., 0.])) + check(sess.run(gsg_dist1.entropy()), + np.array([-0.74827796, 0.7070056, -4.971432])) + x = tf.constant([[-0.3939393939393939]]) + check(sess.run(gsg_dist1.logp(x)), + np.array([0.736003, -3.1547096, -6.5595593])) + + # This is just the squashed distribution means. Verified using + # _unsquash (which was itself verified as part of the logp test). + expected = np.array([[-0.41861248, 0.1745522], + [0.49179232, 0.62231755], + [-0.99906087, 0.81425166]]) + check(sess.run(gsg_dist1.deterministic_sample()), expected) + def test_beta(self): input_space = Box(-2.0, 1.0, shape=(200, 10)) low, high = -1.0, 2.0 From 731afbd60b53e0758a2dde57e08ea738626dfe39 Mon Sep 17 00:00:00 2001 From: Matthew Earl Date: Fri, 17 Apr 2020 10:05:32 +0100 Subject: [PATCH 12/21] linter fixes --- rllib/models/catalog.py | 3 ++- rllib/models/tf/tf_action_dist.py | 7 ++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/rllib/models/catalog.py b/rllib/models/catalog.py index 7b0ff999ed03..79b715b536c0 100644 --- a/rllib/models/catalog.py +++ b/rllib/models/catalog.py @@ -13,7 +13,8 @@ from ray.rllib.models.tf.lstm_v1 import LSTM from ray.rllib.models.tf.modelv1_compat import make_v1_wrapper from ray.rllib.models.tf.tf_action_dist import Categorical, MultiCategorical, \ - Deterministic, DiagGaussian, MultiActionDistribution, Dirichlet, GaussianSquashedGaussian + Deterministic, DiagGaussian, MultiActionDistribution, Dirichlet, \ + GaussianSquashedGaussian from ray.rllib.models.tf.tf_modelv2 import TFModelV2 from ray.rllib.models.tf.visionnet_v1 import VisionNetwork from ray.rllib.models.torch.torch_modelv2 import TorchModelV2 diff --git a/rllib/models/tf/tf_action_dist.py b/rllib/models/tf/tf_action_dist.py index 0ecb364cf875..9c819b26922d 100644 --- a/rllib/models/tf/tf_action_dist.py +++ b/rllib/models/tf/tf_action_dist.py @@ -298,7 +298,8 @@ def logp(self, x): assert len(x.shape) >= 2, "First dim batch, second dim variable" unsquashed_values = self._unsquash(x) log_prob = self.distr.log_prob(value=unsquashed_values) - return tf.reduce_sum(log_prob - self._log_squash_grad(unsquashed_values), axis=1) + return tf.reduce_sum(log_prob - + self._log_squash_grad(unsquashed_values), axis=1) @override(TFActionDistribution) def _build_sample_op(self): @@ -308,7 +309,7 @@ def _build_sample_op(self): def _squash(self, unsquashed_values): """Squash an array element-wise into the (high, low) range - + Arguments: unsquashed_values: values to be squashed @@ -320,7 +321,7 @@ def _squash(self, unsquashed_values): def _unsquash(self, values): """Unsquash an array element-wise from the (high, low) range - + Arguments: squashed_values: values to be unsquashed From 7e89931e284e70db00b1942a5f7ed529430d6a39 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Mon, 11 Jan 2021 22:58:26 +0100 Subject: [PATCH 13/21] WIP. --- rllib/models/catalog.py | 4 +-- rllib/models/tests/test_distributions.py | 19 ++++++++------ rllib/models/tf/tf_action_dist.py | 33 ++++++++++++++++-------- 3 files changed, 34 insertions(+), 22 deletions(-) diff --git a/rllib/models/catalog.py b/rllib/models/catalog.py index 6c9fa2ffa06c..eeb737bfdef8 100644 --- a/rllib/models/catalog.py +++ b/rllib/models/catalog.py @@ -218,9 +218,7 @@ def get_action_dist( action_space.bounded_above): return ModelCatalog._make_bounded_dist(action_space) else: - dist = DiagGaussian - #dist_cls = TorchDiagGaussian if framework == "torch" \ - # else DiagGaussian + dist_cls = DiagGaussian elif dist_type == "deterministic": dist_cls = TorchDeterministic if framework == "torch" \ else Deterministic diff --git a/rllib/models/tests/test_distributions.py b/rllib/models/tests/test_distributions.py index 0a7efac07ec5..ec9186fd8b86 100644 --- a/rllib/models/tests/test_distributions.py +++ b/rllib/models/tests/test_distributions.py @@ -383,18 +383,21 @@ def test_gaussian_squashed_gaussian(self): gsg_dist2 = GaussianSquashedGaussian(inputs2, None) # KL, entropy, and logp values have been verified empirically. - check(sess.run(gsg_dist1.kl(gsg_dist2)), - np.array([6.532504, 0., 0.])) - check(sess.run(gsg_dist1.entropy()), - np.array([-0.74827796, 0.7070056, -4.971432])) + check( + sess.run(gsg_dist1.kl(gsg_dist2)), np.array([6.532504, 0., + 0.])) + check( + sess.run(gsg_dist1.entropy()), + np.array([-0.74827796, 0.7070056, -4.971432])) x = tf.constant([[-0.3939393939393939]]) - check(sess.run(gsg_dist1.logp(x)), - np.array([0.736003, -3.1547096, -6.5595593])) + check( + sess.run(gsg_dist1.logp(x)), + np.array([0.736003, -3.1547096, -6.5595593])) # This is just the squashed distribution means. Verified using # _unsquash (which was itself verified as part of the logp test). - expected = np.array([[-0.41861248, 0.1745522], - [0.49179232, 0.62231755], + expected = np.array([[-0.41861248, + 0.1745522], [0.49179232, 0.62231755], [-0.99906087, 0.81425166]]) check(sess.run(gsg_dist1.deterministic_sample()), expected) diff --git a/rllib/models/tf/tf_action_dist.py b/rllib/models/tf/tf_action_dist.py index 37372c9af907..eb796c697ef4 100644 --- a/rllib/models/tf/tf_action_dist.py +++ b/rllib/models/tf/tf_action_dist.py @@ -329,10 +329,10 @@ def logp(self, x: TensorType) -> TensorType: log_prob_gaussian = self.distr.log_prob(unsquashed_values) # For safety reasons, clamp somehow, only then sum up. log_prob_gaussian = tf.clip_by_value(log_prob_gaussian, -100, 100) - log_prob_gaussian = tf.reduce_sum(log_prob_gaussian, axis=-1) # Get log-prob for squashed Gaussian. - return tf.reduce_sum(log_prob_gaussian - - self._log_squash_grad(unsquashed_values), axis=1) + return tf.reduce_sum( + log_prob_gaussian - self._log_squash_grad(unsquashed_values), + axis=-1) @override(TFActionDistribution) def _build_sample_op(self): @@ -497,9 +497,11 @@ def kl(self, other): other_mean = other.distr.loc other_std = other.distr.scale - return tf.reduce_sum((other.log_std - self.log_std + - (tf.square(std) + tf.square(mean - other_mean)) / - (2.0 * tf.square(other_std)) - 0.5), axis=1) + return tf.reduce_sum( + (other.log_std - self.log_std + + (tf.math.square(std) + tf.math.square(mean - other_mean)) / + (2.0 * tf.math.square(other_std)) - 0.5), + axis=1) def entropy(self): # Entropy is: @@ -509,15 +511,17 @@ def entropy(self): mean = self.distr.loc std = self.distr.scale - return tf.reduce_sum(tf.log(self.high - self.low) - - (tf.log(self._SCALE) - self.log_std + - (tf.square(std) + tf.square(mean)) / - (2.0 * tf.square(self._SCALE)) - 0.5), axis=1) + return tf.reduce_sum( + tf.math.log(self.high - self.low) - + (tf.math.log(self._SCALE) - self.log_std + + (tf.math.square(std) + tf.math.square(mean)) / + (2.0 * tf.math.square(self._SCALE)) - 0.5), + axis=1) def _log_squash_grad(self, unsquashed_values): squash_dist = tfp.distributions.Normal(loc=0, scale=self._SCALE) log_grad = squash_dist.log_prob(value=unsquashed_values) - log_grad += tf.log(self.high - self.low) + log_grad += tf.math.log(self.high - self.low) return log_grad def _squash(self, raw_values): @@ -532,6 +536,13 @@ def _unsquash(self, values): return self._SCALE * tfp.bijectors.NormalCDF().inverse( (values - self.low) / (self.high - self.low)) + @staticmethod + @override(ActionDistribution) + def required_model_output_shape( + action_space: gym.Space, + model_config: ModelConfigDict) -> Union[int, np.ndarray]: + return np.prod(action_space.shape) * 2 + class Deterministic(TFActionDistribution): """Action distribution that returns the input values directly. From 921843003ff09c4b0993b83ba96012a907ceff9f Mon Sep 17 00:00:00 2001 From: sven1977 Date: Mon, 11 Jan 2021 23:32:10 +0100 Subject: [PATCH 14/21] LINT. --- rllib/models/tests/test_distributions.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/rllib/models/tests/test_distributions.py b/rllib/models/tests/test_distributions.py index ec9186fd8b86..8061ed7a9bbb 100644 --- a/rllib/models/tests/test_distributions.py +++ b/rllib/models/tests/test_distributions.py @@ -371,13 +371,15 @@ def test_diag_gaussian(self): def test_gaussian_squashed_gaussian(self): for fw, sess in framework_iterator(frameworks="tf", session=True): - inputs1 = tf.constant([[-0.5, 0.2, np.log(0.1), np.log(0.5)], - [0.6, 0.8, np.log(0.7), np.log(0.8)], - [-10.0, 1.2, np.log(0.9), np.log(1.0)]]) - - inputs2 = tf.constant([[0.2, 0.3, np.log(0.2), np.log(0.4)], - [0.6, 0.8, np.log(0.7), np.log(0.8)], - [-11.0, 1.2, np.log(0.9), np.log(1.0)]]) + inputs1 = tf.constant([ + [-0.5, 0.2, np.log(0.1), np.log(0.5)], + [0.6, 0.8, np.log(0.7), np.log(0.8)], + [-10.0, 1.2, np.log(0.9), np.log(1.0)]]) + + inputs2 = tf.constant([ + [0.2, 0.3, np.log(0.2), np.log(0.4)], + [0.6, 0.8, np.log(0.7), np.log(0.8)], + [-11.0, 1.2, np.log(0.9), np.log(1.0)]]) gsg_dist1 = GaussianSquashedGaussian(inputs1, None) gsg_dist2 = GaussianSquashedGaussian(inputs2, None) From ed7d261274780f7836e7419d2dc3a9d83cbfa7b0 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Tue, 12 Jan 2021 12:28:07 +0100 Subject: [PATCH 15/21] Fix. --- rllib/models/catalog.py | 2 +- rllib/models/tests/test_distributions.py | 28 ++++++++++++++++-------- 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/rllib/models/catalog.py b/rllib/models/catalog.py index eeb737bfdef8..317292ec6103 100644 --- a/rllib/models/catalog.py +++ b/rllib/models/catalog.py @@ -213,7 +213,7 @@ def get_action_dist( "using a Tuple action space, or the multi-agent API.") if dist_type is None: if framework == "torch": - return TorchDiagGaussian + dist_cls = TorchDiagGaussian elif np.any(action_space.bounded_below & action_space.bounded_above): return ModelCatalog._make_bounded_dist(action_space) diff --git a/rllib/models/tests/test_distributions.py b/rllib/models/tests/test_distributions.py index 8061ed7a9bbb..32814976303e 100644 --- a/rllib/models/tests/test_distributions.py +++ b/rllib/models/tests/test_distributions.py @@ -371,15 +371,25 @@ def test_diag_gaussian(self): def test_gaussian_squashed_gaussian(self): for fw, sess in framework_iterator(frameworks="tf", session=True): - inputs1 = tf.constant([ - [-0.5, 0.2, np.log(0.1), np.log(0.5)], - [0.6, 0.8, np.log(0.7), np.log(0.8)], - [-10.0, 1.2, np.log(0.9), np.log(1.0)]]) - - inputs2 = tf.constant([ - [0.2, 0.3, np.log(0.2), np.log(0.4)], - [0.6, 0.8, np.log(0.7), np.log(0.8)], - [-11.0, 1.2, np.log(0.9), np.log(1.0)]]) + inputs1 = tf.constant([[-0.5, 0.2, + np.log(0.1), + np.log(0.5)], + [0.6, 0.8, + np.log(0.7), + np.log(0.8)], + [-10.0, 1.2, + np.log(0.9), + np.log(1.0)]]) + + inputs2 = tf.constant([[0.2, 0.3, + np.log(0.2), + np.log(0.4)], + [0.6, 0.8, + np.log(0.7), + np.log(0.8)], + [-11.0, 1.2, + np.log(0.9), + np.log(1.0)]]) gsg_dist1 = GaussianSquashedGaussian(inputs1, None) gsg_dist2 = GaussianSquashedGaussian(inputs2, None) From 6098ddaf26e78042bb8cca89e0e8a94fe5709ae6 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Tue, 12 Jan 2021 19:52:45 +0100 Subject: [PATCH 16/21] Torch version and LINT. --- rllib/models/catalog.py | 54 +++--- rllib/models/tests/test_distributions.py | 72 ++++---- rllib/models/tf/tf_action_dist.py | 140 +++++++++------- rllib/models/torch/torch_action_dist.py | 201 +++++++++++++++++++---- 4 files changed, 308 insertions(+), 159 deletions(-) diff --git a/rllib/models/catalog.py b/rllib/models/catalog.py index 317292ec6103..2be6446a55d1 100644 --- a/rllib/models/catalog.py +++ b/rllib/models/catalog.py @@ -16,7 +16,7 @@ GaussianSquashedGaussian, \ MultiActionDistribution, MultiCategorical from ray.rllib.models.torch.torch_action_dist import TorchCategorical, \ - TorchDeterministic, TorchDiagGaussian, \ + TorchDeterministic, TorchDiagGaussian, TorchGaussianSquashedGaussian, \ TorchMultiActionDistribution, TorchMultiCategorical from ray.rllib.utils.annotations import DeveloperAPI, PublicAPI from ray.rllib.utils.deprecation import DEPRECATED_VALUE @@ -211,14 +211,31 @@ def get_action_dist( "Consider reshaping this into a single dimension, " "using a custom action distribution, " "using a Tuple action space, or the multi-agent API.") + if dist_type is None: - if framework == "torch": - dist_cls = TorchDiagGaussian - elif np.any(action_space.bounded_below & - action_space.bounded_above): - return ModelCatalog._make_bounded_dist(action_space) + cls = TorchGaussianSquashedGaussian if framework == "torch" \ + else GaussianSquashedGaussian + if np.any(action_space.bounded_below & + action_space.bounded_above): + if any(action_space.low != action_space.low[0]) or \ + any(action_space.high != action_space.high[0]): + raise UnsupportedSpaceException( + "The Box space has non-matching low/high value(s)." + " Make sure that all low/high values are the same " + "accross the different dimensions of your Box. If " + "the different dimensions must have different " + "low/high values, try splitting up your space into" + " a Tuple or Dict space.") + dist_cls = partial( + cls, + low=action_space.low[0], + high=action_space.high[0]) + num_inputs = cls.required_model_output_shape( + action_space, config) + return dist_cls, num_inputs else: - dist_cls = DiagGaussian + dist_cls = TorchDiagGaussian if framework == "torch" else \ + DiagGaussian elif dist_type == "deterministic": dist_cls = TorchDeterministic if framework == "torch" \ else Deterministic @@ -730,29 +747,6 @@ def _get_multi_action_distribution(dist_class, action_space, config, input_lens=input_lens), int(sum(input_lens)) return dist_class - @staticmethod - def _make_bounded_dist(action_space): - child_dists = [] - - low = np.ravel(action_space.low) - high = np.ravel(action_space.high) - - for l, h in zip(low, high): - if not np.isinf(l) and not np.isinf(h): - dist = partial(GaussianSquashedGaussian, low=l, high=h) - else: - dist = DiagGaussian - child_dists.append(dist) - - if len(child_dists) == 1: - return dist, 2 - - return partial( - MultiActionDistribution, - action_space=action_space, - child_distributions=child_dists, - input_lens=[2] * len(child_dists)), 2 * len(child_dists) - @staticmethod def _validate_config(config: ModelConfigDict, framework: str) -> None: """Validates a given model config dict. diff --git a/rllib/models/tests/test_distributions.py b/rllib/models/tests/test_distributions.py index 32814976303e..cac34560c589 100644 --- a/rllib/models/tests/test_distributions.py +++ b/rllib/models/tests/test_distributions.py @@ -10,8 +10,8 @@ DiagGaussian, GaussianSquashedGaussian, GumbelSoftmax, \ MultiActionDistribution, MultiCategorical, SquashedGaussian from ray.rllib.models.torch.torch_action_dist import TorchBeta, \ - TorchCategorical, TorchDiagGaussian, TorchMultiActionDistribution, \ - TorchMultiCategorical, TorchSquashedGaussian + TorchCategorical, TorchDiagGaussian, TorchGaussianSquashedGaussian, \ + TorchMultiActionDistribution, TorchMultiCategorical, TorchSquashedGaussian from ray.rllib.utils.framework import try_import_tf, try_import_torch from ray.rllib.utils.numpy import MIN_LOG_NN_OUTPUT, MAX_LOG_NN_OUTPUT, \ softmax, SMALL_NUMBER, LARGE_INTEGER @@ -370,48 +370,48 @@ def test_diag_gaussian(self): check(outs, log_prob, decimals=4) def test_gaussian_squashed_gaussian(self): - for fw, sess in framework_iterator(frameworks="tf", session=True): - inputs1 = tf.constant([[-0.5, 0.2, - np.log(0.1), - np.log(0.5)], - [0.6, 0.8, - np.log(0.7), - np.log(0.8)], - [-10.0, 1.2, - np.log(0.9), - np.log(1.0)]]) - - inputs2 = tf.constant([[0.2, 0.3, - np.log(0.2), - np.log(0.4)], - [0.6, 0.8, - np.log(0.7), - np.log(0.8)], - [-11.0, 1.2, - np.log(0.9), - np.log(1.0)]]) - - gsg_dist1 = GaussianSquashedGaussian(inputs1, None) - gsg_dist2 = GaussianSquashedGaussian(inputs2, None) + for fw, sess in framework_iterator(session=True): + inputs1 = np.array( + [[-0.5, 0.2, np.log(0.1), np.log(0.5)], [ + 0.6, 0.8, np.log(0.7), np.log(0.8) + ], [-10.0, 1.2, np.log(0.9), + np.log(1.0)]], + dtype=np.float32) + + inputs2 = np.array( + [[0.2, 0.3, np.log(0.2), np.log(0.4)], [ + 0.6, 0.8, np.log(0.7), np.log(0.8) + ], [-11.0, 1.2, np.log(0.9), + np.log(1.0)]], + dtype=np.float32) + + cls = GaussianSquashedGaussian if fw != "torch" else \ + TorchGaussianSquashedGaussian + gsg_dist1 = cls(inputs1, None) + gsg_dist2 = cls(inputs2, None) # KL, entropy, and logp values have been verified empirically. check( - sess.run(gsg_dist1.kl(gsg_dist2)), np.array([6.532504, 0., - 0.])) + gsg_dist1.kl(gsg_dist2), + np.array([6.532504, 0.0, 0.0], dtype=np.float32)) check( - sess.run(gsg_dist1.entropy()), - np.array([-0.74827796, 0.7070056, -4.971432])) - x = tf.constant([[-0.3939393939393939]]) + gsg_dist1.entropy(), + np.array( + [-0.74827796, 0.7070056, -4.971432], dtype=np.float32)) + x = np.array([[-0.3939393939393939]], dtype=np.float32) + if fw == "torch": + x = torch.from_numpy(x) check( - sess.run(gsg_dist1.logp(x)), - np.array([0.736003, -3.1547096, -6.5595593])) + gsg_dist1.logp(x), + np.array([0.736003, -3.1547096, -6.5595593], dtype=np.float32)) # This is just the squashed distribution means. Verified using # _unsquash (which was itself verified as part of the logp test). - expected = np.array([[-0.41861248, - 0.1745522], [0.49179232, 0.62231755], - [-0.99906087, 0.81425166]]) - check(sess.run(gsg_dist1.deterministic_sample()), expected) + expected = np.array( + [[-0.41861248, 0.1745522], [0.49179232, 0.62231755], + [-0.99906087, 0.81425166]], + dtype=np.float32) + check(gsg_dist1.deterministic_sample(), expected) def test_beta(self): input_space = Box(-2.0, 1.0, shape=(2000, 10)) diff --git a/rllib/models/tf/tf_action_dist.py b/rllib/models/tf/tf_action_dist.py index eb796c697ef4..9e02bd06fb36 100644 --- a/rllib/models/tf/tf_action_dist.py +++ b/rllib/models/tf/tf_action_dist.py @@ -385,6 +385,7 @@ class SquashedGaussian(_SquashedGaussianBase): `low`+SMALL_NUMBER or `high`-SMALL_NUMBER respectively. """ + @override(_SquashedGaussianBase) def _log_squash_grad(self, unsquashed_values): unsquashed_values_tanhd = tf.math.tanh(unsquashed_values) return tf.math.log(1 - unsquashed_values_tanhd**2 + SMALL_NUMBER) @@ -420,65 +421,13 @@ def required_model_output_shape( return np.prod(action_space.shape) * 2 -class Beta(TFActionDistribution): - """ - A Beta distribution is defined on the interval [0, 1] and parameterized by - shape parameters alpha and beta (also called concentration parameters). - - PDF(x; alpha, beta) = x**(alpha - 1) (1 - x)**(beta - 1) / Z - with Z = Gamma(alpha) Gamma(beta) / Gamma(alpha + beta) - and Gamma(n) = (n - 1)! - """ - - def __init__(self, - inputs: List[TensorType], - model: ModelV2, - low: float = 0.0, - high: float = 1.0): - # Stabilize input parameters (possibly coming from a linear layer). - inputs = tf.clip_by_value(inputs, log(SMALL_NUMBER), - -log(SMALL_NUMBER)) - inputs = tf.math.log(tf.math.exp(inputs) + 1.0) + 1.0 - self.low = low - self.high = high - alpha, beta = tf.split(inputs, 2, axis=-1) - # Note: concentration0==beta, concentration1=alpha (!) - self.dist = tfp.distributions.Beta( - concentration1=alpha, concentration0=beta) - super().__init__(inputs, model) - - @override(ActionDistribution) - def deterministic_sample(self) -> TensorType: - mean = self.dist.mean() - return self._squash(mean) - - @override(TFActionDistribution) - def _build_sample_op(self) -> TensorType: - return self._squash(self.dist.sample()) - - @override(ActionDistribution) - def logp(self, x: TensorType) -> TensorType: - unsquashed_values = self._unsquash(x) - return tf.math.reduce_sum( - self.dist.log_prob(unsquashed_values), axis=-1) - - def _squash(self, raw_values: TensorType) -> TensorType: - return raw_values * (self.high - self.low) + self.low - - def _unsquash(self, values: TensorType) -> TensorType: - return (values - self.low) / (self.high - self.low) - - @staticmethod - @override(ActionDistribution) - def required_model_output_shape( - action_space: gym.Space, - model_config: ModelConfigDict) -> Union[int, np.ndarray]: - return np.prod(action_space.shape) * 2 - - class GaussianSquashedGaussian(_SquashedGaussianBase): """A gaussian CDF-squashed Gaussian distribution. + Can be used instead of the `SquashedGaussian` in case entropy or KL need + to be computable in analytical form (`SquashedGaussian` can only provide + those empirically). + The distribution will never return low or high exactly, but `low`+SMALL_NUMBER or `high`-SMALL_NUMBER respectively. """ @@ -544,6 +493,62 @@ def required_model_output_shape( return np.prod(action_space.shape) * 2 +class Beta(TFActionDistribution): + """ + A Beta distribution is defined on the interval [0, 1] and parameterized by + shape parameters alpha and beta (also called concentration parameters). + + PDF(x; alpha, beta) = x**(alpha - 1) (1 - x)**(beta - 1) / Z + with Z = Gamma(alpha) Gamma(beta) / Gamma(alpha + beta) + and Gamma(n) = (n - 1)! + """ + + def __init__(self, + inputs: List[TensorType], + model: ModelV2, + low: float = 0.0, + high: float = 1.0): + # Stabilize input parameters (possibly coming from a linear layer). + inputs = tf.clip_by_value(inputs, log(SMALL_NUMBER), + -log(SMALL_NUMBER)) + inputs = tf.math.log(tf.math.exp(inputs) + 1.0) + 1.0 + self.low = low + self.high = high + alpha, beta = tf.split(inputs, 2, axis=-1) + # Note: concentration0==beta, concentration1=alpha (!) + self.dist = tfp.distributions.Beta( + concentration1=alpha, concentration0=beta) + super().__init__(inputs, model) + + @override(ActionDistribution) + def deterministic_sample(self) -> TensorType: + mean = self.dist.mean() + return self._squash(mean) + + @override(TFActionDistribution) + def _build_sample_op(self) -> TensorType: + return self._squash(self.dist.sample()) + + @override(ActionDistribution) + def logp(self, x: TensorType) -> TensorType: + unsquashed_values = self._unsquash(x) + return tf.math.reduce_sum( + self.dist.log_prob(unsquashed_values), axis=-1) + + def _squash(self, raw_values: TensorType) -> TensorType: + return raw_values * (self.high - self.low) + self.low + + def _unsquash(self, values: TensorType) -> TensorType: + return (values - self.low) / (self.high - self.low) + + @staticmethod + @override(ActionDistribution) + def required_model_output_shape( + action_space: gym.Space, + model_config: ModelConfigDict) -> Union[int, np.ndarray]: + return np.prod(action_space.shape) * 2 + + class Deterministic(TFActionDistribution): """Action distribution that returns the input values directly. @@ -573,15 +578,26 @@ def required_model_output_shape( class MultiActionDistribution(TFActionDistribution): """Action distribution that operates on a set of actions. - - Args: - inputs (Tensor list): A list of tensors from which to compute samples. """ - def __init__(self, inputs, model, *, child_distributions, input_lens, - action_space): - ActionDistribution.__init__(self, inputs, model) + def __init__(self, inputs: List[TensorType], model: ModelV2, *, + child_distributions: List[TFActionDistribution], + input_lens: List[int], action_space: gym.spaces.Space): + """Initializes a MultiActionDistribution instance. + Args: + inputs (List[TensorType): A list of tensors from which to compute + samples. + child_distributions (List[TFActionDistribution]): Flattened list + of the child distributions within this multi distribution. + input_lens (List[int]): List of input vector lengths corresponding + to the list of `child_distributions`. + action_space (gym.spaces.Space): The (Tuple/Dict) action space + underlying this multi distribution. + """ + ActionDistribution.__init__(self, inputs, model) + # The base struct (python dict/tuple) corresponding to the complex + # action space. self.action_space_struct = get_base_struct_from_space(action_space) self.input_lens = np.array(input_lens, dtype=np.int32) diff --git a/rllib/models/torch/torch_action_dist.py b/rllib/models/torch/torch_action_dist.py index ecc8aa276a3e..ff4df66710c3 100644 --- a/rllib/models/torch/torch_action_dist.py +++ b/rllib/models/torch/torch_action_dist.py @@ -184,12 +184,8 @@ def required_model_output_shape( return np.prod(action_space.shape) * 2 -class TorchSquashedGaussian(TorchDistributionWrapper): - """A tanh-squashed Gaussian distribution defined by: mean, std, low, high. - - The distribution will never return low or high exactly, but - `low`+SMALL_NUMBER or `high`-SMALL_NUMBER respectively. - """ +class _TorchSquashedGaussianBase(TorchDistributionWrapper): + """A diagonal gaussian distribution, squashed into bounded support.""" def __init__(self, inputs: List[TensorType], @@ -205,49 +201,112 @@ def __init__(self, (excluding this value). """ super().__init__(inputs, model) - # Split inputs into mean and log(std). + + assert low < high + # Make sure high and low are torch tensors. + self.low = torch.from_numpy(np.array(low)) + self.high = torch.from_numpy(np.array(high)) + # Place on correct device. + if isinstance(model, TorchModelV2): + device = next(model.parameters()).device + self.low = self.low.to(device) + self.high = self.high.to(device) + mean, log_std = torch.chunk(self.inputs, 2, dim=-1) - # Clip `scale` values (coming from NN) to reasonable values. - log_std = torch.clamp(log_std, MIN_LOG_NN_OUTPUT, MAX_LOG_NN_OUTPUT) - std = torch.exp(log_std) - self.dist = torch.distributions.normal.Normal(mean, std) - assert np.all(np.less(low, high)) - self.low = low - self.high = high + self._num_vars = mean.shape[1] + assert log_std.shape[1] == self._num_vars + # Clip `std` values (coming from NN) to reasonable values. + self.log_std = torch.clamp(log_std, MIN_LOG_NN_OUTPUT, + MAX_LOG_NN_OUTPUT) + # Clip loc too, for numerical stability reasons. + mean = torch.clamp(mean, -3, 3) + std = torch.exp(self.log_std) + self.distr = torch.distributions.normal.Normal(mean, std) + assert len(self.distr.loc.shape) == 2 + assert len(self.distr.scale.shape) == 2 @override(ActionDistribution) def deterministic_sample(self) -> TensorType: - self.last_sample = self._squash(self.dist.mean) - return self.last_sample - - @override(TorchDistributionWrapper) - def sample(self) -> TensorType: - # Use the reparameterization version of `dist.sample` to allow for - # the results to be backprop'able e.g. in a loss term. - normal_sample = self.dist.rsample() - self.last_sample = self._squash(normal_sample) - return self.last_sample + mean = self.distr.loc + assert len(mean.shape) == 2 + s = self._squash(mean) + assert len(s.shape) == 2 + return s @override(ActionDistribution) def logp(self, x: TensorType) -> TensorType: # Unsquash values (from [low,high] to ]-inf,inf[) + assert len(x.shape) >= 2, "First dim batch, second dim variable" unsquashed_values = self._unsquash(x) # Get log prob of unsquashed values from our Normal. - log_prob_gaussian = self.dist.log_prob(unsquashed_values) + log_prob_gaussian = self.distr.log_prob(unsquashed_values) # For safety reasons, clamp somehow, only then sum up. log_prob_gaussian = torch.clamp(log_prob_gaussian, -100, 100) - log_prob_gaussian = torch.sum(log_prob_gaussian, dim=-1) # Get log-prob for squashed Gaussian. - unsquashed_values_tanhd = torch.tanh(unsquashed_values) - log_prob = log_prob_gaussian - torch.sum( - torch.log(1 - unsquashed_values_tanhd**2 + SMALL_NUMBER), dim=-1) - return log_prob + return torch.sum( + log_prob_gaussian - self._log_squash_grad(unsquashed_values), + dim=-1) @override(TorchDistributionWrapper) + def sample(self): + s = self._squash(self.distr.sample()) + assert len(s.shape) == 2 + return s + + def _squash(self, unsquashed_values): + """Squash an array element-wise into the (high, low) range + + Arguments: + unsquashed_values: values to be squashed + + Returns: + The squashed values. The output shape is `unsquashed_values.shape` + + """ + raise NotImplementedError + + def _unsquash(self, values): + """Unsquash an array element-wise from the (high, low) range + + Arguments: + squashed_values: values to be unsquashed + + Returns: + The unsquashed values. The output shape is `squashed_values.shape` + + """ + raise NotImplementedError + + def _log_squash_grad(self, unsquashed_values): + """Log gradient of _squash with respect to its argument. + + Arguments: + squashed_values: Point at which to measure the gradient. + + Returns: + The gradient at the given point. The output shape is + `squashed_values.shape`. + + """ + raise NotImplementedError + + +class TorchSquashedGaussian(_TorchSquashedGaussianBase): + """A tanh-squashed Gaussian distribution defined by: mean, std, low, high. + + The distribution will never return low or high exactly, but + `low`+SMALL_NUMBER or `high`-SMALL_NUMBER respectively. + """ + + def _log_squash_grad(self, unsquashed_values): + unsquashed_values_tanhd = torch.tanh(unsquashed_values) + return torch.log(1 - unsquashed_values_tanhd**2 + SMALL_NUMBER) + + @override(ActionDistribution) def entropy(self) -> TensorType: raise ValueError("Entropy not defined for SquashedGaussian!") - @override(TorchDistributionWrapper) + @override(ActionDistribution) def kl(self, other: ActionDistribution) -> TensorType: raise ValueError("KL not defined for SquashedGaussian!") @@ -274,6 +333,86 @@ def required_model_output_shape( return np.prod(action_space.shape) * 2 +class TorchGaussianSquashedGaussian(_TorchSquashedGaussianBase): + """A gaussian CDF-squashed Gaussian distribution. + + Can be used instead of the `SquashedGaussian` in case entropy or KL need + to be computable in analytical form (`SquashedGaussian` can only provide + those empirically). + + The distribution will never return low or high exactly, but + `low`+SMALL_NUMBER or `high`-SMALL_NUMBER respectively. + """ + # Chosen to match the standard logistic variance, so that: + # Var(N(0, 2 * _SCALE)) = Var(Logistic(0, 1)) + _SCALE = 0.5 * 1.8137 + SQUASH_DIST = torch.distributions.normal.Normal(0.0, _SCALE) + + @override(_TorchSquashedGaussianBase) + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.scale = torch.from_numpy(np.array(self._SCALE)) + if self.model: + self.scale = self.scale.to( + next(iter(self.model.parameters)).device) + + @override(ActionDistribution) + def kl(self, other): + # KL(self || other) is just the KL of the two unsquashed distributions. + assert isinstance(other, TorchGaussianSquashedGaussian) + + mean = self.distr.loc + std = self.distr.scale + + other_mean = other.distr.loc + other_std = other.distr.scale + + return torch.sum( + (other.log_std - self.log_std + + (torch.pow(std, 2.0) + torch.pow(mean - other_mean, 2.0)) / + (2.0 * torch.pow(other_std, 2.0)) - 0.5), + axis=1) + + def entropy(self): + # Entropy is: + # -KL(self.distr || N(0, _SCALE)) + log(high - low) + # where the latter distribution's CDF is used to do the squashing. + + mean = self.distr.loc + std = self.distr.scale + + return torch.sum( + torch.log(self.high - self.low) - + (torch.log(self.scale) - self.log_std + + (torch.pow(std, 2.0) + torch.pow(mean, 2.0)) / + (2.0 * torch.pow(self.scale, 2.0)) - 0.5), + dim=1) + + def _log_squash_grad(self, unsquashed_values): + log_grad = self.SQUASH_DIST.log_prob(value=unsquashed_values) + log_grad += torch.log(self.high - self.low) + return log_grad + + def _squash(self, raw_values): + # Make sure raw_values are not too high/low (such that tanh would + # return exactly 1.0/-1.0, which would lead to +/-inf log-probs). + + values = self.SQUASH_DIST.cdf(raw_values) # / self._SCALE) + return (torch.clamp(values, SMALL_NUMBER, 1.0 - SMALL_NUMBER) * + (self.high - self.low) + self.low) + + def _unsquash(self, values): + x = (values - self.low) / (self.high - self.low) + return self.SQUASH_DIST.icdf(x) + + @staticmethod + @override(ActionDistribution) + def required_model_output_shape( + action_space: gym.Space, + model_config: ModelConfigDict) -> Union[int, np.ndarray]: + return np.prod(action_space.shape) * 2 + + class TorchBeta(TorchDistributionWrapper): """ A Beta distribution is defined on the interval [0, 1] and parameterized by From 37f6986dfd3c544d0dbaa2170388881e0ef9a538 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Tue, 12 Jan 2021 22:19:02 +0100 Subject: [PATCH 17/21] LINT. --- rllib/models/torch/torch_action_dist.py | 19 +++++++++++-------- rllib/tests/run_regression_tests.py | 6 +++++- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/rllib/models/torch/torch_action_dist.py b/rllib/models/torch/torch_action_dist.py index ff4df66710c3..2dc7e72823d9 100644 --- a/rllib/models/torch/torch_action_dist.py +++ b/rllib/models/torch/torch_action_dist.py @@ -225,12 +225,20 @@ def __init__(self, assert len(self.distr.loc.shape) == 2 assert len(self.distr.scale.shape) == 2 + @override(TorchDistributionWrapper) + def sample(self): + s = self._squash(self.distr.sample()) + assert len(s.shape) == 2 + self.last_sample = s + return s + @override(ActionDistribution) def deterministic_sample(self) -> TensorType: mean = self.distr.loc assert len(mean.shape) == 2 s = self._squash(mean) assert len(s.shape) == 2 + self.last_sample = s return s @override(ActionDistribution) @@ -247,12 +255,6 @@ def logp(self, x: TensorType) -> TensorType: log_prob_gaussian - self._log_squash_grad(unsquashed_values), dim=-1) - @override(TorchDistributionWrapper) - def sample(self): - s = self._squash(self.distr.sample()) - assert len(s.shape) == 2 - return s - def _squash(self, unsquashed_values): """Squash an array element-wise into the (high, low) range @@ -346,7 +348,8 @@ class TorchGaussianSquashedGaussian(_TorchSquashedGaussianBase): # Chosen to match the standard logistic variance, so that: # Var(N(0, 2 * _SCALE)) = Var(Logistic(0, 1)) _SCALE = 0.5 * 1.8137 - SQUASH_DIST = torch.distributions.normal.Normal(0.0, _SCALE) + SQUASH_DIST = \ + torch.distributions.normal.Normal(0.0, _SCALE) if torch else None @override(_TorchSquashedGaussianBase) def __init__(self, *args, **kwargs): @@ -354,7 +357,7 @@ def __init__(self, *args, **kwargs): self.scale = torch.from_numpy(np.array(self._SCALE)) if self.model: self.scale = self.scale.to( - next(iter(self.model.parameters)).device) + next(iter(self.model.parameters())).device) @override(ActionDistribution) def kl(self, other): diff --git a/rllib/tests/run_regression_tests.py b/rllib/tests/run_regression_tests.py index 3f42147e4071..d5c2a73453a7 100644 --- a/rllib/tests/run_regression_tests.py +++ b/rllib/tests/run_regression_tests.py @@ -43,6 +43,10 @@ "--torch", action="store_true", help="Runs all tests with PyTorch enabled.") +parser.add_argument( + "--local-mode", + action="store_true", + help="Whether to run ray with `local_mode=True`.") if __name__ == "__main__": args = parser.parse_args() @@ -92,7 +96,7 @@ passed = False for i in range(3): try: - ray.init(num_cpus=5) + ray.init(num_cpus=5, local_mode=args.local_mode) trials = run_experiments(experiments, resume=False, verbose=2) finally: ray.shutdown() From 32f42015628c0c870031623e930c6d0a2ec51a54 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Wed, 13 Jan 2021 09:02:22 +0100 Subject: [PATCH 18/21] Fix and LINT. --- rllib/models/catalog.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/rllib/models/catalog.py b/rllib/models/catalog.py index 2be6446a55d1..6b7b9226058d 100644 --- a/rllib/models/catalog.py +++ b/rllib/models/catalog.py @@ -217,8 +217,10 @@ def get_action_dist( else GaussianSquashedGaussian if np.any(action_space.bounded_below & action_space.bounded_above): - if any(action_space.low != action_space.low[0]) or \ - any(action_space.high != action_space.high[0]): + lo = np.min(action_space.low) + hi = np.max(action_space.high) + if any(action_space.low != lo) or \ + any(action_space.high != hi): raise UnsupportedSpaceException( "The Box space has non-matching low/high value(s)." " Make sure that all low/high values are the same " @@ -226,10 +228,7 @@ def get_action_dist( "the different dimensions must have different " "low/high values, try splitting up your space into" " a Tuple or Dict space.") - dist_cls = partial( - cls, - low=action_space.low[0], - high=action_space.high[0]) + dist_cls = partial(cls, low=lo, high=hi) num_inputs = cls.required_model_output_shape( action_space, config) return dist_cls, num_inputs From c61739c3a28f30ff29cb87d00312f81c42c5995e Mon Sep 17 00:00:00 2001 From: sven1977 Date: Wed, 13 Jan 2021 13:49:36 +0100 Subject: [PATCH 19/21] wip --- rllib/BUILD | 14 +++++++------- rllib/models/catalog.py | 4 ++-- .../tests/test_model_catalog.py} | 0 3 files changed, 9 insertions(+), 9 deletions(-) rename rllib/{tests/test_catalog.py => models/tests/test_model_catalog.py} (100%) diff --git a/rllib/BUILD b/rllib/BUILD index 199cc5ad975e..7ec9225ed7a6 100644 --- a/rllib/BUILD +++ b/rllib/BUILD @@ -1074,6 +1074,13 @@ py_test( # Tag: models # -------------------------------------------------------------------- +py_test( + name = "tests/test_model_catalog", + tags = ["models"], + size = "small", + srcs = ["models/tests/test_model_catalog.py"] +) + py_test( name = "test_convtranspose2d_stack", tags = ["models"], @@ -1222,13 +1229,6 @@ py_test( srcs = ["tests/test_attention_net_learning.py"] ) -py_test( - name = "tests/test_catalog", - tags = ["tests_dir", "tests_dir_C"], - size = "medium", - srcs = ["tests/test_catalog.py"] -) - py_test( name = "tests/test_checkpoint_restore", tags = ["tests_dir", "tests_dir_C"], diff --git a/rllib/models/catalog.py b/rllib/models/catalog.py index 99d82512dc5f..ffab27c0413a 100644 --- a/rllib/models/catalog.py +++ b/rllib/models/catalog.py @@ -231,8 +231,8 @@ def get_action_dist( action_space.bounded_above): lo = np.min(action_space.low) hi = np.max(action_space.high) - if any(action_space.low != lo) or \ - any(action_space.high != hi): + if np.any(action_space.low != lo) or \ + np.any(action_space.high != hi): raise UnsupportedSpaceException( "The Box space has non-matching low/high value(s)." " Make sure that all low/high values are the same " diff --git a/rllib/tests/test_catalog.py b/rllib/models/tests/test_model_catalog.py similarity index 100% rename from rllib/tests/test_catalog.py rename to rllib/models/tests/test_model_catalog.py From ec3b6dcd39e97cdd5875ca506997f4bef73bb6ee Mon Sep 17 00:00:00 2001 From: sven1977 Date: Sun, 11 Apr 2021 18:38:39 +0200 Subject: [PATCH 20/21] LINT. --- rllib/models/catalog.py | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/rllib/models/catalog.py b/rllib/models/catalog.py index 21f8bb3cd8d8..c82be2585708 100644 --- a/rllib/models/catalog.py +++ b/rllib/models/catalog.py @@ -186,7 +186,7 @@ def get_action_dist( config: ModelConfigDict, dist_type: Optional[Union[str, Type[ActionDistribution]]] = None, framework: str = "tf", - **kwargs) -> (type, int): + **kwargs) -> Tuple[Type[ActionDistribution], int]: """Returns a distribution class and size for the given action space. Args: @@ -200,11 +200,9 @@ def get_action_dist( constructor. Returns: - Tuple: - - dist_class (ActionDistribution): Python class of the - distribution. - - dist_dim (int): The size of the input vector to the - distribution. + Tuple[Type[ActionDistribution], int]: Python class of the + distribution and the size of the input vector to the + distribution. """ dist_cls = None @@ -250,8 +248,8 @@ def get_action_dist( "using a Tuple action space, or the multi-agent API.") if dist_type is None: - cls = TorchGaussianSquashedGaussian if framework == "torch" \ - else GaussianSquashedGaussian + cls = TorchGaussianSquashedGaussian if \ + framework == "torch" else GaussianSquashedGaussian if np.any(action_space.bounded_below & action_space.bounded_above): lo = np.min(action_space.low) @@ -259,19 +257,20 @@ def get_action_dist( if np.any(action_space.low != lo) or \ np.any(action_space.high != hi): raise UnsupportedSpaceException( - "The Box space has non-matching low/high value(s)." - " Make sure that all low/high values are the same " - "accross the different dimensions of your Box. If " - "the different dimensions must have different " - "low/high values, try splitting up your space into" - " a Tuple or Dict space.") + "The Box space has non-matching low/high " + "value(s). Make sure that all low/high " + "values are the same accross the different " + "dimensions of your Box. If the different " + "dimensions must have different low/high " + "values, try splitting up your space into " + "a Tuple or Dict space.") dist_cls = partial(cls, low=lo, high=hi) num_inputs = cls.required_model_output_shape( action_space, config) return dist_cls, num_inputs else: - dist_cls = TorchDiagGaussian if framework == "torch" else \ - DiagGaussian + dist_cls = TorchDiagGaussian if \ + framework == "torch" else DiagGaussian elif dist_type == "deterministic": dist_cls = TorchDeterministic if framework == "torch" \ else Deterministic From 4878362f7bdca492fe8c41e81b4682aa40b6337d Mon Sep 17 00:00:00 2001 From: sven1977 Date: Sun, 11 Apr 2021 20:16:29 +0200 Subject: [PATCH 21/21] fix and LINT. --- rllib/models/catalog.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rllib/models/catalog.py b/rllib/models/catalog.py index c82be2585708..d2817ac93265 100644 --- a/rllib/models/catalog.py +++ b/rllib/models/catalog.py @@ -186,7 +186,7 @@ def get_action_dist( config: ModelConfigDict, dist_type: Optional[Union[str, Type[ActionDistribution]]] = None, framework: str = "tf", - **kwargs) -> Tuple[Type[ActionDistribution], int]: + **kwargs) -> (Type[ActionDistribution], int): """Returns a distribution class and size for the given action space. Args: