From 13258dc1075b6c1e1afe94c9b95b2b8f0bd236bf Mon Sep 17 00:00:00 2001 From: Seth Axen Date: Mon, 19 Aug 2024 19:46:18 +0200 Subject: [PATCH 01/17] Add optimized ECDF confidence bands --- arviz/stats/ecdf_utils.py | 93 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 92 insertions(+), 1 deletion(-) diff --git a/arviz/stats/ecdf_utils.py b/arviz/stats/ecdf_utils.py index 66658d659c..0ad85d5cc0 100644 --- a/arviz/stats/ecdf_utils.py +++ b/arviz/stats/ecdf_utils.py @@ -5,6 +5,7 @@ import numpy as np from scipy.stats import uniform, binom +from scipy.optimize import minimize_scalar def compute_ecdf(sample: np.ndarray, eval_points: np.ndarray) -> np.ndarray: @@ -92,6 +93,7 @@ def ecdf_confidence_band( method : string, default "simulated" The method used to compute the confidence band. Valid options are: - "pointwise": Compute the pointwise (i.e. marginal) confidence band. + - "optimized": Use optimization to estimate a simultaneous confidence band. - "simulated": Use Monte Carlo simulation to estimate a simultaneous confidence band. `rvs` must be provided. rvs: callable, optional @@ -115,12 +117,16 @@ def ecdf_confidence_band( if method == "pointwise": prob_pointwise = prob + elif method == "optimized": + prob_pointwise = _optimize_simultaneous_ecdf_band_probability( + ndraws, eval_points, cdf_at_eval_points, prob=prob, **kwargs + ) elif method == "simulated": prob_pointwise = _simulate_simultaneous_ecdf_band_probability( ndraws, eval_points, cdf_at_eval_points, prob=prob, **kwargs ) else: - raise ValueError(f"Unknown method {method}. Valid options are 'pointwise' or 'simulated'.") + raise ValueError(f"Unknown method {method}. Valid options are 'pointwise', 'optimized', or 'simulated'.") prob_lower, prob_upper = _get_pointwise_confidence_band( prob_pointwise, ndraws, cdf_at_eval_points @@ -129,6 +135,91 @@ def ecdf_confidence_band( return prob_lower, prob_upper +def _update_interior_probabilities( + prob_left: np.ndarray, + interval_left: np.ndarray, + interval_right: np.ndarray, + cdf_left: float, + cdf_right: float, + ndraws: int, +) -> np.ndarray: + """Update the probability that an ECDF has been within the envelope including at the current point. + + Arguments + --------- + prob_left : np.ndarray + For each point in the interior at the previous point, the joint probability that it and all + points before are in the interior. + interval_left : np.ndarray + The set of points in the interior at the previous point. + interval_right : np.ndarray + The set of points in the interior at the current point. + cdf_left : float + The CDF at the previous point. + cdf_right : float + The CDF at the current point. + ndraws : int + Number of draws in the original dataset. + + Returns + ------- + prob_right : np.ndarray + For each point in the interior at the current point, the joint probability that it and all + previous points are in the interior. + """ + z_tilde = (cdf_right - cdf_left) / (1 - cdf_left) + interval_left = interval_left[:, np.newaxis] + prob_conditional = binom.pmf( + interval_right, ndraws - interval_left, z_tilde, loc=interval_left + ) + prob_right = prob_left.dot(prob_conditional) + return prob_right + + +def _band_optimization_objective( + prob_pointwise: float, + cdf_at_eval_points: np.ndarray, + ndraws: int, + prob_target: float, +) -> float: + """Objective function for optimizing the simultaneous confidence band probability.""" + lower, upper = _get_pointwise_confidence_band(prob_pointwise, ndraws, cdf_at_eval_points) + lower = (lower * ndraws).astype(int) + upper = (upper * ndraws).astype(int) + + interval_left = np.zeros(1) + cdf_left = 0 + prob_interior = np.ones(1) + for i in range(cdf_at_eval_points.shape[0]): + interval_right = np.arange(lower[i], upper[i] + 1) + cdf_right = cdf_at_eval_points[i] + prob_interior = _update_interior_probabilities( + prob_interior, interval_left, interval_right, cdf_left, cdf_right, ndraws + ) + interval_left = interval_right + cdf_left = cdf_right + prob_interior = prob_interior.sum() + return abs(prob_interior - prob_target) + + +def _optimize_simultaneous_ecdf_band_probability( + ndraws: int, + eval_points: np.ndarray, + cdf_at_eval_points: np.ndarray, + prob: float = 0.95, + **kwargs, +): + """Estimate probability for simultaneous confidence band using optimization. + + This function simulates the pointwise probability needed to construct pointwise confidence bands + that form a `prob`-level confidence envelope for the ECDF of a sample. + """ + cdf_at_eval_points = np.unique(cdf_at_eval_points) + objective = lambda p: _band_optimization_objective(p, cdf_at_eval_points, ndraws, prob) + prob_pointwise = minimize_scalar(objective, bounds=(prob, 1), method="bounded").x + return prob_pointwise + + def _simulate_simultaneous_ecdf_band_probability( ndraws: int, eval_points: np.ndarray, From 3e17e5c134dac6356f68bb6e011e8cdd63aedd9b Mon Sep 17 00:00:00 2001 From: Seth Axen Date: Mon, 19 Aug 2024 19:46:36 +0200 Subject: [PATCH 02/17] Test ECDF optimized confidence bands --- arviz/tests/base_tests/test_stats_ecdf_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arviz/tests/base_tests/test_stats_ecdf_utils.py b/arviz/tests/base_tests/test_stats_ecdf_utils.py index b6d5563464..bdb36a1804 100644 --- a/arviz/tests/base_tests/test_stats_ecdf_utils.py +++ b/arviz/tests/base_tests/test_stats_ecdf_utils.py @@ -109,7 +109,7 @@ def test_get_pointwise_confidence_band(dist, prob, ndraws, num_trials=1_000, see ids=["continuous", "continuous default rvs", "discrete"], ) @pytest.mark.parametrize("ndraws", [10_000]) -@pytest.mark.parametrize("method", ["pointwise", "simulated"]) +@pytest.mark.parametrize("method", ["pointwise", "optimized", "simulated"]) def test_ecdf_confidence_band(dist, rvs, prob, ndraws, method, num_trials=1_000, seed=57): """Test test_ecdf_confidence_band.""" eval_points = np.linspace(*dist.interval(0.99), 10) From 5c6b09185eca28dda040c2532d9c5e6ebd869d02 Mon Sep 17 00:00:00 2001 From: Seth Axen Date: Mon, 19 Aug 2024 19:46:49 +0200 Subject: [PATCH 03/17] Default to optimized confidence bands --- arviz/stats/ecdf_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arviz/stats/ecdf_utils.py b/arviz/stats/ecdf_utils.py index 0ad85d5cc0..bdd8faf00c 100644 --- a/arviz/stats/ecdf_utils.py +++ b/arviz/stats/ecdf_utils.py @@ -74,7 +74,7 @@ def ecdf_confidence_band( eval_points: np.ndarray, cdf_at_eval_points: np.ndarray, prob: float = 0.95, - method="simulated", + method="optimized", **kwargs, ) -> Tuple[np.ndarray, np.ndarray]: """Compute the `prob`-level confidence band for the ECDF. From aac15cce0d705054097f7a2a5e9ab9ec6e9574bc Mon Sep 17 00:00:00 2001 From: Seth Axen Date: Mon, 19 Aug 2024 19:50:38 +0200 Subject: [PATCH 04/17] Use more descriptive names --- arviz/stats/ecdf_utils.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/arviz/stats/ecdf_utils.py b/arviz/stats/ecdf_utils.py index bdd8faf00c..c6d93fa0c9 100644 --- a/arviz/stats/ecdf_utils.py +++ b/arviz/stats/ecdf_utils.py @@ -135,7 +135,7 @@ def ecdf_confidence_band( return prob_lower, prob_upper -def _update_interior_probabilities( +def _update_ecdf_band_interior_probabilities( prob_left: np.ndarray, interval_left: np.ndarray, interval_right: np.ndarray, @@ -176,7 +176,7 @@ def _update_interior_probabilities( return prob_right -def _band_optimization_objective( +def _ecdf_band_optimization_objective( prob_pointwise: float, cdf_at_eval_points: np.ndarray, ndraws: int, @@ -184,16 +184,16 @@ def _band_optimization_objective( ) -> float: """Objective function for optimizing the simultaneous confidence band probability.""" lower, upper = _get_pointwise_confidence_band(prob_pointwise, ndraws, cdf_at_eval_points) - lower = (lower * ndraws).astype(int) - upper = (upper * ndraws).astype(int) + lower_count = (lower * ndraws).astype(int) + upper_count = (upper * ndraws).astype(int) + 1 interval_left = np.zeros(1) cdf_left = 0 prob_interior = np.ones(1) for i in range(cdf_at_eval_points.shape[0]): - interval_right = np.arange(lower[i], upper[i] + 1) + interval_right = np.arange(lower_count[i], upper_count[i]) cdf_right = cdf_at_eval_points[i] - prob_interior = _update_interior_probabilities( + prob_interior = _update_ecdf_band_interior_probabilities( prob_interior, interval_left, interval_right, cdf_left, cdf_right, ndraws ) interval_left = interval_right @@ -215,7 +215,7 @@ def _optimize_simultaneous_ecdf_band_probability( that form a `prob`-level confidence envelope for the ECDF of a sample. """ cdf_at_eval_points = np.unique(cdf_at_eval_points) - objective = lambda p: _band_optimization_objective(p, cdf_at_eval_points, ndraws, prob) + objective = lambda p: _ecdf_band_optimization_objective(p, cdf_at_eval_points, ndraws, prob) prob_pointwise = minimize_scalar(objective, bounds=(prob, 1), method="bounded").x return prob_pointwise From 23c37d5e4c693567c1d3c760e8d66915a95ab999 Mon Sep 17 00:00:00 2001 From: Seth Axen Date: Mon, 19 Aug 2024 20:02:31 +0200 Subject: [PATCH 05/17] Support optimized ECDF bands in plots --- arviz/plots/ecdfplot.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/arviz/plots/ecdfplot.py b/arviz/plots/ecdfplot.py index 9fe136c718..624f37c98e 100644 --- a/arviz/plots/ecdfplot.py +++ b/arviz/plots/ecdfplot.py @@ -73,6 +73,7 @@ def plot_ecdf( - False: No confidence bands are plotted (default). - True: Plot bands computed with the default algorithm (subject to change) - "pointwise": Compute the pointwise (i.e. marginal) confidence band. + - "optimized": Use optimization to estimate a simultaneous confidence band. - "simulated": Use Monte Carlo simulation to estimate a simultaneous confidence band. @@ -238,9 +239,10 @@ def plot_ecdf( ) confidence_bands = "pointwise" else: - confidence_bands = "simulated" - elif confidence_bands == "simulated" and pointwise: - raise ValueError("Cannot specify both `confidence_bands='simulated'` and `pointwise=True`") + confidence_bands = "optimized" + # if pointwise especified, confidence_bands must be a bool or 'pointwise' + elif confidence_bands not in [False, "pointwise"] and pointwise: + raise ValueError(f"Cannot specify both `confidence_bands='{confidence_bands}'` and `pointwise=True`") if fpr is not None: warnings.warn( @@ -298,7 +300,7 @@ def plot_ecdf( "`eval_points` explicitly.", BehaviourChangeWarning, ) - if confidence_bands == "simulated": + if confidence_bands in ["optimized", "simulated"]: warnings.warn( "For simultaneous bands to be correctly calibrated, specify `eval_points` " "independent of the `values`" From 70e2ee6d035516619c6aeb345228432db2f3d50e Mon Sep 17 00:00:00 2001 From: Seth Axen Date: Mon, 19 Aug 2024 20:02:50 +0200 Subject: [PATCH 06/17] Test optimized ECDF bands in plots --- arviz/tests/base_tests/test_plots_matplotlib.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arviz/tests/base_tests/test_plots_matplotlib.py b/arviz/tests/base_tests/test_plots_matplotlib.py index 1183812db4..a7510ed4d1 100644 --- a/arviz/tests/base_tests/test_plots_matplotlib.py +++ b/arviz/tests/base_tests/test_plots_matplotlib.py @@ -1285,7 +1285,7 @@ def test_plot_ecdf_eval_points(): assert axes is not None -@pytest.mark.parametrize("confidence_bands", [True, "pointwise", "simulated"]) +@pytest.mark.parametrize("confidence_bands", [True, "pointwise", "optimized", "simulated"]) def test_plot_ecdf_confidence_bands(confidence_bands): """Check that all confidence_bands values correctly accepted""" data = np.random.randn(4, 1000) @@ -1326,6 +1326,8 @@ def test_plot_ecdf_error(): # contradictory confidence band types with pytest.raises(ValueError): plot_ecdf(data, cdf=dist.cdf, confidence_bands="simulated", pointwise=True) + with pytest.raises(ValueError): + plot_ecdf(data, cdf=dist.cdf, confidence_bands="optimized", pointwise=True) plot_ecdf(data, cdf=dist.cdf, confidence_bands=True, pointwise=True) plot_ecdf(data, cdf=dist.cdf, confidence_bands="pointwise") From e0c43be3adcd2c358f8a38683f72b43d9568767a Mon Sep 17 00:00:00 2001 From: Seth Axen Date: Mon, 19 Aug 2024 20:03:53 +0200 Subject: [PATCH 07/17] Run black on new code --- arviz/plots/ecdfplot.py | 4 +++- arviz/stats/ecdf_utils.py | 12 ++++++------ 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/arviz/plots/ecdfplot.py b/arviz/plots/ecdfplot.py index 624f37c98e..db1747ddac 100644 --- a/arviz/plots/ecdfplot.py +++ b/arviz/plots/ecdfplot.py @@ -242,7 +242,9 @@ def plot_ecdf( confidence_bands = "optimized" # if pointwise especified, confidence_bands must be a bool or 'pointwise' elif confidence_bands not in [False, "pointwise"] and pointwise: - raise ValueError(f"Cannot specify both `confidence_bands='{confidence_bands}'` and `pointwise=True`") + raise ValueError( + f"Cannot specify both `confidence_bands='{confidence_bands}'` and `pointwise=True`" + ) if fpr is not None: warnings.warn( diff --git a/arviz/stats/ecdf_utils.py b/arviz/stats/ecdf_utils.py index c6d93fa0c9..98e84fb682 100644 --- a/arviz/stats/ecdf_utils.py +++ b/arviz/stats/ecdf_utils.py @@ -126,7 +126,9 @@ def ecdf_confidence_band( ndraws, eval_points, cdf_at_eval_points, prob=prob, **kwargs ) else: - raise ValueError(f"Unknown method {method}. Valid options are 'pointwise', 'optimized', or 'simulated'.") + raise ValueError( + f"Unknown method {method}. Valid options are 'pointwise', 'optimized', or 'simulated'." + ) prob_lower, prob_upper = _get_pointwise_confidence_band( prob_pointwise, ndraws, cdf_at_eval_points @@ -150,9 +152,9 @@ def _update_ecdf_band_interior_probabilities( prob_left : np.ndarray For each point in the interior at the previous point, the joint probability that it and all points before are in the interior. - interval_left : np.ndarray + interval_left : np.ndarray The set of points in the interior at the previous point. - interval_right : np.ndarray + interval_right : np.ndarray The set of points in the interior at the current point. cdf_left : float The CDF at the previous point. @@ -169,9 +171,7 @@ def _update_ecdf_band_interior_probabilities( """ z_tilde = (cdf_right - cdf_left) / (1 - cdf_left) interval_left = interval_left[:, np.newaxis] - prob_conditional = binom.pmf( - interval_right, ndraws - interval_left, z_tilde, loc=interval_left - ) + prob_conditional = binom.pmf(interval_right, ndraws - interval_left, z_tilde, loc=interval_left) prob_right = prob_left.dot(prob_conditional) return prob_right From 2dfa73fb96b2b001bd8f3ec7167d30f43c64f577 Mon Sep 17 00:00:00 2001 From: Seth Axen Date: Mon, 19 Aug 2024 21:34:49 +0200 Subject: [PATCH 08/17] Drop unnecessary rvs keyword from examples --- arviz/plots/ecdfplot.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/arviz/plots/ecdfplot.py b/arviz/plots/ecdfplot.py index db1747ddac..818640417d 100644 --- a/arviz/plots/ecdfplot.py +++ b/arviz/plots/ecdfplot.py @@ -217,8 +217,7 @@ def plot_ecdf( >>> pit_vals = distribution.cdf(sample) >>> uniform_dist = uniform(0, 1) >>> az.plot_ecdf( - >>> pit_vals, cdf=uniform_dist.cdf, - >>> rvs=uniform_dist.rvs, confidence_bands=True + >>> pit_vals, cdf=uniform_dist.cdf, confidence_bands=True, >>> ) Plot an ECDF-difference plot of PIT values. @@ -227,8 +226,8 @@ def plot_ecdf( :context: close-figs >>> az.plot_ecdf( - >>> pit_vals, cdf = uniform_dist.cdf, rvs = uniform_dist.rvs, - >>> confidence_bands = True, difference = True + >>> pit_vals, cdf = uniform_dist.cdf, confidence_bands = True, + >>> difference = True >>> ) """ if confidence_bands is True: From 3f8704e1dc6ff9bb76b82a57ed1f1e97126c3045 Mon Sep 17 00:00:00 2001 From: Seth Axen Date: Mon, 19 Aug 2024 22:19:53 +0200 Subject: [PATCH 09/17] Update changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8ecd1fecce..3b2625f86f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,7 @@ ## v0.x.x Unreleased ### New features +- Add optimized simultaneous ECDF confidence bands ([2368](https://github.com/arviz-devs/arviz/pull/2368)) ### Maintenance and fixes From 52a483d5711cb033a8d3be26fbd9ce06e6652e2e Mon Sep 17 00:00:00 2001 From: Seth Axen Date: Mon, 19 Aug 2024 23:01:03 +0200 Subject: [PATCH 10/17] Fix or disable linting errors --- arviz/stats/ecdf_utils.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arviz/stats/ecdf_utils.py b/arviz/stats/ecdf_utils.py index 98e84fb682..d36bdc90c7 100644 --- a/arviz/stats/ecdf_utils.py +++ b/arviz/stats/ecdf_utils.py @@ -145,7 +145,8 @@ def _update_ecdf_band_interior_probabilities( cdf_right: float, ndraws: int, ) -> np.ndarray: - """Update the probability that an ECDF has been within the envelope including at the current point. + """Update the probability that an ECDF has been within the envelope including at the current + point. Arguments --------- @@ -204,10 +205,10 @@ def _ecdf_band_optimization_objective( def _optimize_simultaneous_ecdf_band_probability( ndraws: int, - eval_points: np.ndarray, + eval_points: np.ndarray, # pylint: disable=unused-argument cdf_at_eval_points: np.ndarray, prob: float = 0.95, - **kwargs, + **kwargs, # pylint: disable=unused-argument ): """Estimate probability for simultaneous confidence band using optimization. From a583babbff9cc9f08f8dac9efe8d26b86fe21c8c Mon Sep 17 00:00:00 2001 From: Seth Axen Date: Thu, 22 Aug 2024 15:48:47 +0200 Subject: [PATCH 11/17] Speed up optimized bands with numba if available --- arviz/stats/ecdf_utils.py | 110 ++++++++++++++---- .../tests/base_tests/test_stats_ecdf_utils.py | 15 ++- 2 files changed, 104 insertions(+), 21 deletions(-) diff --git a/arviz/stats/ecdf_utils.py b/arviz/stats/ecdf_utils.py index d36bdc90c7..9afaeb4951 100644 --- a/arviz/stats/ecdf_utils.py +++ b/arviz/stats/ecdf_utils.py @@ -1,5 +1,6 @@ """Functions for evaluating ECDFs and their confidence bands.""" +import math from typing import Any, Callable, Optional, Tuple import warnings @@ -7,6 +8,19 @@ from scipy.stats import uniform, binom from scipy.optimize import minimize_scalar +try: + from numba import jit, vectorize +except ImportError: + + def jit(*args, **kwargs): + return lambda f: f + + def vectorize(*args, **kwargs): + return lambda f: f + + +from ..utils import Numba + def compute_ecdf(sample: np.ndarray, eval_points: np.ndarray) -> np.ndarray: """Compute ECDF of the sorted `sample` at the evaluation points.""" @@ -141,8 +155,7 @@ def _update_ecdf_band_interior_probabilities( prob_left: np.ndarray, interval_left: np.ndarray, interval_right: np.ndarray, - cdf_left: float, - cdf_right: float, + p: float, ndraws: int, ) -> np.ndarray: """Update the probability that an ECDF has been within the envelope including at the current @@ -157,10 +170,8 @@ def _update_ecdf_band_interior_probabilities( The set of points in the interior at the previous point. interval_right : np.ndarray The set of points in the interior at the current point. - cdf_left : float - The CDF at the previous point. - cdf_right : float - The CDF at the current point. + p : float + The probability of any given point found between the previous point and the current one. ndraws : int Number of draws in the original dataset. @@ -170,13 +181,75 @@ def _update_ecdf_band_interior_probabilities( For each point in the interior at the current point, the joint probability that it and all previous points are in the interior. """ - z_tilde = (cdf_right - cdf_left) / (1 - cdf_left) interval_left = interval_left[:, np.newaxis] - prob_conditional = binom.pmf(interval_right, ndraws - interval_left, z_tilde, loc=interval_left) + prob_conditional = binom.pmf(interval_right, ndraws - interval_left, p, loc=interval_left) + prob_right = prob_left.dot(prob_conditional) + return prob_right + + +@jit(nopython=True) +def _lbinom(n, k): + k = min(k, n - k) # Take advantage of symmetry + if k < 0: + return 0.0 + if k == 0: + return 1.0 + return math.lgamma(n + 1) - math.lgamma(k + 1) - math.lgamma(n - k + 1) + + +@vectorize(["float64(int64, int64, float64, int64)"]) +def _binom_pmf(k, n, p, loc): + k -= loc + if k == 0: + return (1 - p) ** n + if k == n: + return p**k + if p == 0: + return 1.0 if k == 0 else 0.0 + if p == 1: + return 1.0 if k == n else 0.0 + return np.exp(_lbinom(n, k) + k * np.log(p) + (n - k) * np.log1p(-p)) + + +@jit(nopython=True) +def _update_ecdf_band_interior_probabilities_numba( + prob_left: np.ndarray, + interval_left: np.ndarray, + interval_right: np.ndarray, + p: float, + ndraws: int, +) -> np.ndarray: + interval_left = interval_left[:, np.newaxis] + prob_conditional = _binom_pmf(interval_right, ndraws - interval_left, p, interval_left) prob_right = prob_left.dot(prob_conditional) return prob_right +def _ecdf_band_interior_probability(prob_between_points, ndraws, lower_count, upper_count): + interval_left = np.arange(1) + prob_interior = np.ones(1) + for i in range(prob_between_points.shape[0]): + interval_right = np.arange(lower_count[i], upper_count[i]) + prob_interior = _update_ecdf_band_interior_probabilities( + prob_interior, interval_left, interval_right, prob_between_points[i], ndraws + ) + interval_left = interval_right + return prob_interior.sum() + + +@jit(nopython=True) +def _ecdf_band_interior_probability_numba(prob_between_points, ndraws, lower_count, upper_count): + interval_left = np.arange(1) + prob_interior = np.ones(1) + for i in range(prob_between_points.shape[0]): + interval_right = np.arange(lower_count[i], upper_count[i]) + prob_interior = _update_ecdf_band_interior_probabilities_numba( + prob_interior, interval_left, interval_right, prob_between_points[i], ndraws + ) + interval_left = interval_right + return prob_interior.sum() + + def _ecdf_band_optimization_objective( prob_pointwise: float, cdf_at_eval_points: np.ndarray, @@ -187,19 +260,16 @@ def _ecdf_band_optimization_objective( lower, upper = _get_pointwise_confidence_band(prob_pointwise, ndraws, cdf_at_eval_points) lower_count = (lower * ndraws).astype(int) upper_count = (upper * ndraws).astype(int) + 1 - - interval_left = np.zeros(1) - cdf_left = 0 - prob_interior = np.ones(1) - for i in range(cdf_at_eval_points.shape[0]): - interval_right = np.arange(lower_count[i], upper_count[i]) - cdf_right = cdf_at_eval_points[i] - prob_interior = _update_ecdf_band_interior_probabilities( - prob_interior, interval_left, interval_right, cdf_left, cdf_right, ndraws + cdf_with_zero = np.insert(cdf_at_eval_points[:-1], 0, 0) + prob_between_points = (cdf_at_eval_points - cdf_with_zero) / (1 - cdf_with_zero) + if Numba.numba_flag: + prob_interior = _ecdf_band_interior_probability_numba( + prob_between_points, ndraws, lower_count, upper_count + ) + else: + prob_interior = _ecdf_band_interior_probability( + prob_between_points, ndraws, lower_count, upper_count ) - interval_left = interval_right - cdf_left = cdf_right - prob_interior = prob_interior.sum() return abs(prob_interior - prob_target) diff --git a/arviz/tests/base_tests/test_stats_ecdf_utils.py b/arviz/tests/base_tests/test_stats_ecdf_utils.py index bdb36a1804..e75b0de2e3 100644 --- a/arviz/tests/base_tests/test_stats_ecdf_utils.py +++ b/arviz/tests/base_tests/test_stats_ecdf_utils.py @@ -10,6 +10,13 @@ _get_pointwise_confidence_band, ) +try: + import numba + + numba_options = [True, False] +except ImportError: + numba_options = [False] + def test_compute_ecdf(): """Test compute_ecdf function.""" @@ -110,8 +117,14 @@ def test_get_pointwise_confidence_band(dist, prob, ndraws, num_trials=1_000, see ) @pytest.mark.parametrize("ndraws", [10_000]) @pytest.mark.parametrize("method", ["pointwise", "optimized", "simulated"]) -def test_ecdf_confidence_band(dist, rvs, prob, ndraws, method, num_trials=1_000, seed=57): +@pytest.mark.parametrize("use_numba", numba_options) +def test_ecdf_confidence_band( + dist, rvs, prob, ndraws, method, use_numba, num_trials=1_000, seed=57 +): """Test test_ecdf_confidence_band.""" + if use_numba and method != "optimized": + pytest.skip("Numba only used in optimized method") + eval_points = np.linspace(*dist.interval(0.99), 10) cdf_at_eval_points = dist.cdf(eval_points) random_state = np.random.default_rng(seed) From bbb29f4f9c0c5ae27f9ac9039a578332611db0c4 Mon Sep 17 00:00:00 2001 From: Seth Axen Date: Thu, 22 Aug 2024 20:12:39 +0200 Subject: [PATCH 12/17] Make functions work even when jit is disabled --- arviz/stats/ecdf_utils.py | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/arviz/stats/ecdf_utils.py b/arviz/stats/ecdf_utils.py index 9afaeb4951..f02c3359fb 100644 --- a/arviz/stats/ecdf_utils.py +++ b/arviz/stats/ecdf_utils.py @@ -187,28 +187,21 @@ def _update_ecdf_band_interior_probabilities( return prob_right -@jit(nopython=True) -def _lbinom(n, k): - k = min(k, n - k) # Take advantage of symmetry - if k < 0: - return 0.0 - if k == 0: - return 1.0 - return math.lgamma(n + 1) - math.lgamma(k + 1) - math.lgamma(n - k + 1) - - @vectorize(["float64(int64, int64, float64, int64)"]) def _binom_pmf(k, n, p, loc): k -= loc - if k == 0: - return (1 - p) ** n - if k == n: - return p**k + if k < 0 or k > n: + return 0.0 if p == 0: return 1.0 if k == 0 else 0.0 if p == 1: return 1.0 if k == n else 0.0 - return np.exp(_lbinom(n, k) + k * np.log(p) + (n - k) * np.log1p(-p)) + if k == 0: + return (1 - p) ** n + if k == n: + return p**n + lbinom = math.lgamma(n + 1) - math.lgamma(k + 1) - math.lgamma(n - k + 1) + return np.exp(lbinom + k * np.log(p) + (n - k) * np.log1p(-p)) @jit(nopython=True) From d22711827b53fb5d62158d67f9b83c50d854e526 Mon Sep 17 00:00:00 2001 From: Seth Axen Date: Thu, 22 Aug 2024 20:45:29 +0200 Subject: [PATCH 13/17] Fix pylint errors --- arviz/stats/ecdf_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arviz/stats/ecdf_utils.py b/arviz/stats/ecdf_utils.py index f02c3359fb..1985446c68 100644 --- a/arviz/stats/ecdf_utils.py +++ b/arviz/stats/ecdf_utils.py @@ -12,10 +12,10 @@ from numba import jit, vectorize except ImportError: - def jit(*args, **kwargs): + def jit(*args, **kwargs): # pylint: disable=unused-argument return lambda f: f - def vectorize(*args, **kwargs): + def vectorize(*args, **kwargs): # pylint: disable=unused-argument return lambda f: f From 003946c6bc4349a06a3e67532563545c174212e2 Mon Sep 17 00:00:00 2001 From: Seth Axen Date: Mon, 26 Aug 2024 21:45:13 +0200 Subject: [PATCH 14/17] Disable pylint warning --- arviz/tests/base_tests/test_stats_ecdf_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arviz/tests/base_tests/test_stats_ecdf_utils.py b/arviz/tests/base_tests/test_stats_ecdf_utils.py index e75b0de2e3..6c2c3e225e 100644 --- a/arviz/tests/base_tests/test_stats_ecdf_utils.py +++ b/arviz/tests/base_tests/test_stats_ecdf_utils.py @@ -11,7 +11,7 @@ ) try: - import numba + import numba # pylint: disable=unused-import numba_options = [True, False] except ImportError: From 020dff80195233e089e7766fb971d91e4ce2930d Mon Sep 17 00:00:00 2001 From: Seth Axen Date: Tue, 27 Aug 2024 11:02:45 +0200 Subject: [PATCH 15/17] Add heuristic for switching between band methods --- arviz/plots/ecdfplot.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/arviz/plots/ecdfplot.py b/arviz/plots/ecdfplot.py index 818640417d..bc92f48e5f 100644 --- a/arviz/plots/ecdfplot.py +++ b/arviz/plots/ecdfplot.py @@ -238,8 +238,8 @@ def plot_ecdf( ) confidence_bands = "pointwise" else: - confidence_bands = "optimized" - # if pointwise especified, confidence_bands must be a bool or 'pointwise' + confidence_bands = "auto" + # if pointwise specified, confidence_bands must be a bool or 'pointwise' elif confidence_bands not in [False, "pointwise"] and pointwise: raise ValueError( f"Cannot specify both `confidence_bands='{confidence_bands}'` and `pointwise=True`" @@ -322,6 +322,11 @@ def plot_ecdf( if confidence_bands: ndraws = len(values) + if confidence_bands == "auto": + if ndraws < 200 or num_trials >= 250 * np.sqrt(ndraws): + confidence_bands = "optimized" + else: + confidence_bands = "simulated" x_bands = eval_points lower, higher = ecdf_confidence_band( ndraws, From a209ef4e53edc9546423fc0cdf9ed2deb712854e Mon Sep 17 00:00:00 2001 From: Seth Axen Date: Tue, 27 Aug 2024 11:07:33 +0200 Subject: [PATCH 16/17] Add test for low and high number of draws --- arviz/tests/base_tests/test_plots_matplotlib.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arviz/tests/base_tests/test_plots_matplotlib.py b/arviz/tests/base_tests/test_plots_matplotlib.py index a7510ed4d1..5afcbe1669 100644 --- a/arviz/tests/base_tests/test_plots_matplotlib.py +++ b/arviz/tests/base_tests/test_plots_matplotlib.py @@ -1286,9 +1286,10 @@ def test_plot_ecdf_eval_points(): @pytest.mark.parametrize("confidence_bands", [True, "pointwise", "optimized", "simulated"]) -def test_plot_ecdf_confidence_bands(confidence_bands): +@pytest.mark.parametrize("ndraws", [100, 10_000]) +def test_plot_ecdf_confidence_bands(confidence_bands, ndraws): """Check that all confidence_bands values correctly accepted""" - data = np.random.randn(4, 1000) + data = np.random.randn(4, ndraws // 4) axes = plot_ecdf(data, confidence_bands=confidence_bands, cdf=norm(0, 1).cdf) assert axes is not None From 054efa6eb326985c63e8d230a267733613501773 Mon Sep 17 00:00:00 2001 From: Seth Axen Date: Tue, 27 Aug 2024 11:30:08 +0200 Subject: [PATCH 17/17] Run black --- arviz/tests/base_tests/test_stats_ecdf_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arviz/tests/base_tests/test_stats_ecdf_utils.py b/arviz/tests/base_tests/test_stats_ecdf_utils.py index 6c2c3e225e..68d1e98189 100644 --- a/arviz/tests/base_tests/test_stats_ecdf_utils.py +++ b/arviz/tests/base_tests/test_stats_ecdf_utils.py @@ -11,7 +11,7 @@ ) try: - import numba # pylint: disable=unused-import + import numba # pylint: disable=unused-import numba_options = [True, False] except ImportError: