From e8f75fd69764d6c8fecf4f12185b64cc750ba95a Mon Sep 17 00:00:00 2001 From: Tomoki Ohtsuki Date: Thu, 19 Nov 2020 18:07:26 +0900 Subject: [PATCH 1/6] Add numerically safer ordered probit distribution. --- pymc3/distributions/__init__.py | 2 + pymc3/distributions/discrete.py | 140 +++++++++++++++++++++++++++++- pymc3/distributions/dist_math.py | 40 +++++++++ pymc3/tests/test_distributions.py | 23 ++++- 4 files changed, 203 insertions(+), 2 deletions(-) diff --git a/pymc3/distributions/__init__.py b/pymc3/distributions/__init__.py index 2a671d3293..b609ae5588 100644 --- a/pymc3/distributions/__init__.py +++ b/pymc3/distributions/__init__.py @@ -65,6 +65,7 @@ from .discrete import Geometric from .discrete import Categorical from .discrete import OrderedLogistic +from .discrete import OrderedProbit from .distribution import DensityDist from .distribution import Distribution @@ -143,6 +144,7 @@ "Geometric", "Categorical", "OrderedLogistic", + "OrderedProbit", "DensityDist", "Distribution", "Continuous", diff --git a/pymc3/distributions/discrete.py b/pymc3/distributions/discrete.py index fd2d399618..ff666ec925 100644 --- a/pymc3/distributions/discrete.py +++ b/pymc3/distributions/discrete.py @@ -17,7 +17,17 @@ from scipy import stats import warnings -from .dist_math import bound, factln, binomln, betaln, logpow, random_choice +from .dist_math import ( + bound, + factln, + binomln, + betaln, + logpow, + random_choice, + normal_lcdf, + normal_lccdf, + log_diff_normal_cdf, +) from .distribution import Discrete, draw_values, generate_samples from .shape_utils import broadcast_distribution_samples from pymc3.math import tround, sigmoid, logaddexp, logit, log1pexp @@ -1517,3 +1527,131 @@ def __init__(self, eta, cutpoints, *args, **kwargs): p = p_cum[..., 1:] - p_cum[..., :-1] super().__init__(p=p, *args, **kwargs) + + +class OrderedProbit(Categorical): + R""" + Ordered Probit log-likelihood. + + Useful for regression on ordinal data values whose values range + from 1 to K as a function of some predictor, :math:`\eta`. The + cutpoints, :math:`c`, separate which ranges of :math:`\eta` are + mapped to which of the K observed dependent variables. The number + of cutpoints is K - 1. It is recommended that the cutpoints are + constrained to be ordered. + + In order to stabilize the computation, log-likelihood is computed + in log space using the scaled error function `erfcx`. + + .. math:: + + f(k \mid \eta, c) = \left\{ + \begin{array}{l} + 1 - \text{normal_cdf}(0, \sigma, \eta - c_1) + \,, \text{if } k = 0 \\ + \text{normal_cdf}(0, \sigma, \eta - c_{k - 1}) - + \text{normal_cdf}(0, \sigma, \eta - c_{k}) + \,, \text{if } 0 < k < K \\ + \text{normal_cdf}(0, \sigma, \eta - c_{K - 1}) + \,, \text{if } k = K \\ + \end{array} + \right. + + Parameters + ---------- + eta : float + The predictor. + c : array + The length K - 1 array of cutpoints which break :math:`\eta` into + ranges. Do not explicitly set the first and last elements of + :math:`c` to negative and positive infinity. + + sigma: float + The standard deviation of probit function. + Example + -------- + .. code:: python + + # Generate data for a simple 1 dimensional example problem + n1_c = 300; n2_c = 300; n3_c = 300 + cluster1 = np.random.randn(n1_c) + -1 + cluster2 = np.random.randn(n2_c) + 0 + cluster3 = np.random.randn(n3_c) + 2 + + x = np.concatenate((cluster1, cluster2, cluster3)) + y = np.concatenate((1*np.ones(n1_c), + 2*np.ones(n2_c), + 3*np.ones(n3_c))) - 1 + + # Ordered logistic regression + with pm.Model() as model: + cutpoints = pm.Normal("cutpoints", mu=[-1,1], sigma=10, shape=2, + transform=pm.distributions.transforms.ordered) + y_ = pm.OrderedProbit("y", cutpoints=cutpoints, eta=x, observed=y) + tr = pm.sample(1000) + + # Plot the results + plt.hist(cluster1, 30, alpha=0.5); + plt.hist(cluster2, 30, alpha=0.5); + plt.hist(cluster3, 30, alpha=0.5); + plt.hist(tr["cutpoints"][:,0], 80, alpha=0.2, color='k'); + plt.hist(tr["cutpoints"][:,1], 80, alpha=0.2, color='k'); + + """ + + def __init__(self, eta, cutpoints, *args, **kwargs): + + self.eta = tt.as_tensor_variable(floatX(eta)) + self.cutpoints = tt.as_tensor_variable(cutpoints) + + probits = tt.shape_padright(self.eta) - self.cutpoints + _log_p = tt.concatenate( + [ + tt.shape_padright(normal_lccdf(0, 1, probits[..., 0])), + log_diff_normal_cdf(0, 1, probits[..., :-1], probits[..., 1:]), + tt.shape_padright(normal_lcdf(0, 1, probits[..., -1])), + ], + axis=-1, + ) + _log_p = tt.as_tensor_variable(floatX(_log_p)) + + self._log_p = _log_p + self.mode = tt.argmax(_log_p, axis=-1) + p = tt.exp(_log_p) + + super().__init__(p=p, *args, **kwargs) + + def logp(self, value): + r""" + Calculate log-probability of Ordered Probit distribution at specified value. + + Parameters + ---------- + value: numeric + Value(s) for which log-probability is calculated. If the log probabilities for multiple + values are desired the values must be provided in a numpy array or theano tensor + + Returns + ------- + TensorVariable + """ + logp = self._log_p + k = self.k + + # Clip values before using them for indexing + value_clip = tt.clip(value, 0, k - 1) + + if logp.ndim > 1: + if logp.ndim > value_clip.ndim: + value_clip = tt.shape_padleft(value_clip, logp.ndim - value_clip.ndim) + elif logp.ndim < value_clip.ndim: + logp = tt.shape_padleft(logp, value_clip.ndim - logp.ndim) + pattern = (logp.ndim - 1,) + tuple(range(logp.ndim - 1)) + a = take_along_axis( + logp.dimshuffle(pattern), + value_clip, + ) + else: + a = logp[value_clip] + + return bound(a, value >= 0, value <= (k - 1)) diff --git a/pymc3/distributions/dist_math.py b/pymc3/distributions/dist_math.py index 1af8e86c72..f71838c245 100644 --- a/pymc3/distributions/dist_math.py +++ b/pymc3/distributions/dist_math.py @@ -133,6 +133,46 @@ def normal_lccdf(mu, sigma, x): ) +def log_diff_normal_cdf(mu, sigma, x, y): + """ + Compute :math:`log(\Phi(\frac{x - \mu}{\sigma}) - \Phi(\frac{y - \mu}{\sigma}))` safely in log space. + + Parameters + ---------- + mu: float + mean + sigma: float + std + + x: float + + y: float + must be strictly less than x. + + Returns + ------- + log (\Phi(x) - \Phi(y)) + + """ + x = (x - mu) / sigma / tt.sqrt(2.0) + y = (y - mu) / sigma / tt.sqrt(2.0) + + # To stabilize the computation, consider these three regions: + # 1) x > y > 0 => Use erf(x) = 1 - e^{-x^2} erfcx(x) and erf(y) =1 - e^{-y^2} erfcx(y) + # 2) 0 > x > 0 => Use erf(x) = e^{-x^2} erfcx(-x) and erf(y) = e^{-y^2} erfcx(-y) + # 3) x > 0 > y => Naive formula log( (erf(x) - erf(y)) / 2 ) works fine. + return tt.log(0.5) + tt.switch( + tt.gt(y, 0), + -tt.square(y) + tt.log(tt.erfcx(y) - tt.exp(tt.square(y) - tt.square(x)) * tt.erfcx(x)), + tt.switch( + tt.lt(x, 0), # 0 > x > y + -tt.square(x) + + tt.log(tt.erfcx(-x) - tt.exp(tt.square(x) - tt.square(y)) * tt.erfcx(-y)), + tt.log(tt.erf(x) - tt.erf(y)), # x >0 > y + ), + ) + + def sigma2rho(sigma): """ `sigma -> rho` theano converter diff --git a/pymc3/tests/test_distributions.py b/pymc3/tests/test_distributions.py index 20349d104f..a0c5522724 100644 --- a/pymc3/tests/test_distributions.py +++ b/pymc3/tests/test_distributions.py @@ -66,6 +66,7 @@ Gumbel, Logistic, OrderedLogistic, + OrderedProbit, LogitNormal, Interpolated, ZeroInflatedBinomial, @@ -89,7 +90,7 @@ from scipy import integrate import scipy.stats.distributions as sp import scipy.stats -from scipy.special import logit +from scipy.special import logit, erf, erfcx import theano import theano.tensor as tt from ..math import kronecker @@ -429,6 +430,17 @@ def orderedlogistic_logpdf(value, eta, cutpoints): return np.where(np.all(ps >= 0), np.log(p), -np.inf) +def invprobit(x): + return (erf(x / np.sqrt(2)) + 1) / 2 + + +def orderedprobit_logpdf(value, eta, cutpoints): + c = np.concatenate(([-np.inf], cutpoints, [np.inf])) + ps = np.array([invprobit(eta - cc) - invprobit(eta - cc1) for cc, cc1 in zip(c[:-1], c[1:])]) + p = ps[value] + return np.where(np.all(ps >= 0), np.log(p), -np.inf) + + class Simplex: def __init__(self, n): self.vals = list(simplex_values(n)) @@ -1516,6 +1528,15 @@ def test_orderedlogistic(self, n): lambda value, eta, cutpoints: orderedlogistic_logpdf(value, eta, cutpoints), ) + @pytest.mark.parametrize("n", [2, 3, 4]) + def test_orderedprobit(self, n): + self.pymc3_matches_scipy( + OrderedProbit, + Domain(range(n), "int64"), + {"eta": R, "cutpoints": SortedVector(n - 1)}, + lambda value, eta, cutpoints: orderedprobit_logpdf(value, eta, cutpoints), + ) + def test_densitydist(self): def logp(x): return -log(2 * 0.5) - abs(x - 0.5) / 0.5 From 53c2a0da03cc992a9370afa3049efeeed211a167 Mon Sep 17 00:00:00 2001 From: Tomoki Ohtsuki Date: Thu, 3 Dec 2020 13:42:22 +0900 Subject: [PATCH 2/6] Fix styling issue. --- pymc3/distributions/dist_math.py | 4 ++-- pymc3/tests/test_distributions.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pymc3/distributions/dist_math.py b/pymc3/distributions/dist_math.py index f71838c245..55df628be6 100644 --- a/pymc3/distributions/dist_math.py +++ b/pymc3/distributions/dist_math.py @@ -135,7 +135,7 @@ def normal_lccdf(mu, sigma, x): def log_diff_normal_cdf(mu, sigma, x, y): """ - Compute :math:`log(\Phi(\frac{x - \mu}{\sigma}) - \Phi(\frac{y - \mu}{\sigma}))` safely in log space. + Compute :math:`\\log(\\Phi(\frac{x - \\mu}{\\sigma}) - \\Phi(\frac{y - \\mu}{\\sigma}))` safely in log space. Parameters ---------- @@ -151,7 +151,7 @@ def log_diff_normal_cdf(mu, sigma, x, y): Returns ------- - log (\Phi(x) - \Phi(y)) + log (\\Phi(x) - \\Phi(y)) """ x = (x - mu) / sigma / tt.sqrt(2.0) diff --git a/pymc3/tests/test_distributions.py b/pymc3/tests/test_distributions.py index 44e1ad43cd..be54e63b51 100644 --- a/pymc3/tests/test_distributions.py +++ b/pymc3/tests/test_distributions.py @@ -90,7 +90,7 @@ from scipy import integrate import scipy.stats.distributions as sp import scipy.stats -from scipy.special import logit, erf, erfcx +from scipy.special import logit, erf import theano import theano.tensor as tt from ..math import kronecker From c3615bbf5bb8eb833208af5680a3216b5b2d4080 Mon Sep 17 00:00:00 2001 From: Tomoki Ohtsuki Date: Thu, 3 Dec 2020 15:32:00 +0900 Subject: [PATCH 3/6] Use smaller values for probit & cutpoint testing --- pymc3/tests/test_distributions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pymc3/tests/test_distributions.py b/pymc3/tests/test_distributions.py index be54e63b51..8df69656e9 100644 --- a/pymc3/tests/test_distributions.py +++ b/pymc3/tests/test_distributions.py @@ -1543,7 +1543,7 @@ def test_orderedprobit(self, n): self.pymc3_matches_scipy( OrderedProbit, Domain(range(n), "int64"), - {"eta": R, "cutpoints": SortedVector(n - 1)}, + {"eta": Runif, "cutpoints": UnitSortedVector(n - 1)}, lambda value, eta, cutpoints: orderedprobit_logpdf(value, eta, cutpoints), ) From cef5bed06c7d3688a22938c539a1f04b7e1bd7fb Mon Sep 17 00:00:00 2001 From: Michael Osthege Date: Tue, 8 Dec 2020 20:46:34 +0100 Subject: [PATCH 4/6] fix import sorting --- pymc3/distributions/discrete.py | 2 +- pymc3/tests/test_distributions.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pymc3/distributions/discrete.py b/pymc3/distributions/discrete.py index d98e9617d8..9662ff2693 100644 --- a/pymc3/distributions/discrete.py +++ b/pymc3/distributions/discrete.py @@ -26,8 +26,8 @@ factln, log_diff_normal_cdf, logpow, - normal_lcdf, normal_lccdf, + normal_lcdf, random_choice, ) from pymc3.distributions.distribution import Discrete, draw_values, generate_samples diff --git a/pymc3/tests/test_distributions.py b/pymc3/tests/test_distributions.py index 32281e687d..daffdbe8e1 100644 --- a/pymc3/tests/test_distributions.py +++ b/pymc3/tests/test_distributions.py @@ -26,7 +26,7 @@ from numpy import array, exp, inf, log from numpy.testing import assert_allclose, assert_almost_equal, assert_equal from scipy import integrate -from scipy.special import logit, erf +from scipy.special import erf, logit import pymc3 as pm From 65b5a4d6a5ba7da375241a016a350d62ec2b31c0 Mon Sep 17 00:00:00 2001 From: Tomoki Ohtsuki Date: Wed, 16 Dec 2020 10:46:59 +0900 Subject: [PATCH 5/6] Typo fix --- pymc3/distributions/discrete.py | 2 +- pymc3/distributions/dist_math.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pymc3/distributions/discrete.py b/pymc3/distributions/discrete.py index 9662ff2693..34fb39a48f 100644 --- a/pymc3/distributions/discrete.py +++ b/pymc3/distributions/discrete.py @@ -1697,7 +1697,7 @@ class OrderedProbit(Categorical): 2*np.ones(n2_c), 3*np.ones(n3_c))) - 1 - # Ordered logistic regression + # Ordered probit regression with pm.Model() as model: cutpoints = pm.Normal("cutpoints", mu=[-1,1], sigma=10, shape=2, transform=pm.distributions.transforms.ordered) diff --git a/pymc3/distributions/dist_math.py b/pymc3/distributions/dist_math.py index 99bf11d2dc..e302000735 100644 --- a/pymc3/distributions/dist_math.py +++ b/pymc3/distributions/dist_math.py @@ -160,7 +160,7 @@ def log_diff_normal_cdf(mu, sigma, x, y): # To stabilize the computation, consider these three regions: # 1) x > y > 0 => Use erf(x) = 1 - e^{-x^2} erfcx(x) and erf(y) =1 - e^{-y^2} erfcx(y) - # 2) 0 > x > 0 => Use erf(x) = e^{-x^2} erfcx(-x) and erf(y) = e^{-y^2} erfcx(-y) + # 2) 0 > x > y => Use erf(x) = e^{-x^2} erfcx(-x) and erf(y) = e^{-y^2} erfcx(-y) # 3) x > 0 > y => Naive formula log( (erf(x) - erf(y)) / 2 ) works fine. return tt.log(0.5) + tt.switch( tt.gt(y, 0), From 7b27830aa7971d8180497b52f7a73fe2b60146e0 Mon Sep 17 00:00:00 2001 From: Tomoki Ohtsuki Date: Wed, 16 Dec 2020 10:55:14 +0900 Subject: [PATCH 6/6] add a line to RELEASE-NOTES.md --- RELEASE-NOTES.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/RELEASE-NOTES.md b/RELEASE-NOTES.md index 8858828c06..0700dd84db 100644 --- a/RELEASE-NOTES.md +++ b/RELEASE-NOTES.md @@ -9,6 +9,8 @@ This is the first release to support Python3.9 and to drop Python3.6. - Make `sample_shape` same across all contexts in `draw_values` (see [#4305](https://github.com/pymc-devs/pymc3/pull/4305)). - Removed `theanof.set_theano_config` because it illegally touched Theano's privates (see [#4329](https://github.com/pymc-devs/pymc3/pull/4329)). +### New Features +- `OrderedProbit` distribution added (see [#4232](https://github.com/pymc-devs/pymc3/pull/4232)). ## PyMC3 3.10.0 (7 December 2020)