Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add optimized simultaneous ECDF bands #2368

Merged
merged 17 commits into from
Aug 28, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
## v0.x.x Unreleased

### New features
- Add optimized simultaneous ECDF confidence bands ([2368](https://github.com/arviz-devs/arviz/pull/2368))

### Maintenance and fixes

Expand Down
24 changes: 16 additions & 8 deletions arviz/plots/ecdfplot.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ def plot_ecdf(
- False: No confidence bands are plotted (default).
- True: Plot bands computed with the default algorithm (subject to change)
- "pointwise": Compute the pointwise (i.e. marginal) confidence band.
- "optimized": Use optimization to estimate a simultaneous confidence band.
- "simulated": Use Monte Carlo simulation to estimate a simultaneous confidence
band.

Expand Down Expand Up @@ -216,8 +217,7 @@ def plot_ecdf(
>>> pit_vals = distribution.cdf(sample)
>>> uniform_dist = uniform(0, 1)
>>> az.plot_ecdf(
>>> pit_vals, cdf=uniform_dist.cdf,
>>> rvs=uniform_dist.rvs, confidence_bands=True
>>> pit_vals, cdf=uniform_dist.cdf, confidence_bands=True,
>>> )

Plot an ECDF-difference plot of PIT values.
Expand All @@ -226,8 +226,8 @@ def plot_ecdf(
:context: close-figs

>>> az.plot_ecdf(
>>> pit_vals, cdf = uniform_dist.cdf, rvs = uniform_dist.rvs,
>>> confidence_bands = True, difference = True
>>> pit_vals, cdf = uniform_dist.cdf, confidence_bands = True,
>>> difference = True
>>> )
"""
if confidence_bands is True:
Expand All @@ -238,9 +238,12 @@ def plot_ecdf(
)
confidence_bands = "pointwise"
else:
confidence_bands = "simulated"
elif confidence_bands == "simulated" and pointwise:
raise ValueError("Cannot specify both `confidence_bands='simulated'` and `pointwise=True`")
confidence_bands = "auto"
# if pointwise specified, confidence_bands must be a bool or 'pointwise'
elif confidence_bands not in [False, "pointwise"] and pointwise:
raise ValueError(
f"Cannot specify both `confidence_bands='{confidence_bands}'` and `pointwise=True`"
)

if fpr is not None:
warnings.warn(
Expand Down Expand Up @@ -298,7 +301,7 @@ def plot_ecdf(
"`eval_points` explicitly.",
BehaviourChangeWarning,
)
if confidence_bands == "simulated":
if confidence_bands in ["optimized", "simulated"]:
warnings.warn(
"For simultaneous bands to be correctly calibrated, specify `eval_points` "
"independent of the `values`"
Expand All @@ -319,6 +322,11 @@ def plot_ecdf(

if confidence_bands:
ndraws = len(values)
if confidence_bands == "auto":
if ndraws < 200 or num_trials >= 250 * np.sqrt(ndraws):
confidence_bands = "optimized"
else:
confidence_bands = "simulated"
x_bands = eval_points
lower, higher = ecdf_confidence_band(
ndraws,
Expand Down
159 changes: 157 additions & 2 deletions arviz/stats/ecdf_utils.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,25 @@
"""Functions for evaluating ECDFs and their confidence bands."""

import math
from typing import Any, Callable, Optional, Tuple
import warnings

import numpy as np
from scipy.stats import uniform, binom
from scipy.optimize import minimize_scalar

try:
from numba import jit, vectorize
except ImportError:

def jit(*args, **kwargs): # pylint: disable=unused-argument
return lambda f: f

def vectorize(*args, **kwargs): # pylint: disable=unused-argument
return lambda f: f


from ..utils import Numba


def compute_ecdf(sample: np.ndarray, eval_points: np.ndarray) -> np.ndarray:
Expand Down Expand Up @@ -73,7 +88,7 @@
eval_points: np.ndarray,
cdf_at_eval_points: np.ndarray,
prob: float = 0.95,
method="simulated",
method="optimized",
**kwargs,
) -> Tuple[np.ndarray, np.ndarray]:
"""Compute the `prob`-level confidence band for the ECDF.
Expand All @@ -92,6 +107,7 @@
method : string, default "simulated"
The method used to compute the confidence band. Valid options are:
- "pointwise": Compute the pointwise (i.e. marginal) confidence band.
- "optimized": Use optimization to estimate a simultaneous confidence band.
- "simulated": Use Monte Carlo simulation to estimate a simultaneous confidence band.
`rvs` must be provided.
rvs: callable, optional
Expand All @@ -115,12 +131,18 @@

if method == "pointwise":
prob_pointwise = prob
elif method == "optimized":
prob_pointwise = _optimize_simultaneous_ecdf_band_probability(
ndraws, eval_points, cdf_at_eval_points, prob=prob, **kwargs
)
elif method == "simulated":
prob_pointwise = _simulate_simultaneous_ecdf_band_probability(
ndraws, eval_points, cdf_at_eval_points, prob=prob, **kwargs
)
else:
raise ValueError(f"Unknown method {method}. Valid options are 'pointwise' or 'simulated'.")
raise ValueError(

Check warning on line 143 in arviz/stats/ecdf_utils.py

View check run for this annotation

Codecov / codecov/patch

arviz/stats/ecdf_utils.py#L143

Added line #L143 was not covered by tests
f"Unknown method {method}. Valid options are 'pointwise', 'optimized', or 'simulated'."
)

prob_lower, prob_upper = _get_pointwise_confidence_band(
prob_pointwise, ndraws, cdf_at_eval_points
Expand All @@ -129,6 +151,139 @@
return prob_lower, prob_upper


def _update_ecdf_band_interior_probabilities(
prob_left: np.ndarray,
interval_left: np.ndarray,
interval_right: np.ndarray,
p: float,
ndraws: int,
) -> np.ndarray:
"""Update the probability that an ECDF has been within the envelope including at the current
point.

Arguments
---------
prob_left : np.ndarray
For each point in the interior at the previous point, the joint probability that it and all
points before are in the interior.
interval_left : np.ndarray
The set of points in the interior at the previous point.
interval_right : np.ndarray
The set of points in the interior at the current point.
p : float
The probability of any given point found between the previous point and the current one.
ndraws : int
Number of draws in the original dataset.

Returns
-------
prob_right : np.ndarray
For each point in the interior at the current point, the joint probability that it and all
previous points are in the interior.
"""
interval_left = interval_left[:, np.newaxis]
prob_conditional = binom.pmf(interval_right, ndraws - interval_left, p, loc=interval_left)
prob_right = prob_left.dot(prob_conditional)
return prob_right


@vectorize(["float64(int64, int64, float64, int64)"])
def _binom_pmf(k, n, p, loc):
k -= loc
if k < 0 or k > n:
return 0.0
if p == 0:
return 1.0 if k == 0 else 0.0
if p == 1:
return 1.0 if k == n else 0.0
if k == 0:
return (1 - p) ** n
if k == n:
return p**n
lbinom = math.lgamma(n + 1) - math.lgamma(k + 1) - math.lgamma(n - k + 1)
return np.exp(lbinom + k * np.log(p) + (n - k) * np.log1p(-p))

Check warning on line 204 in arviz/stats/ecdf_utils.py

View check run for this annotation

Codecov / codecov/patch

arviz/stats/ecdf_utils.py#L192-L204

Added lines #L192 - L204 were not covered by tests


@jit(nopython=True)
def _update_ecdf_band_interior_probabilities_numba(
prob_left: np.ndarray,
interval_left: np.ndarray,
interval_right: np.ndarray,
p: float,
ndraws: int,
) -> np.ndarray:
interval_left = interval_left[:, np.newaxis]
prob_conditional = _binom_pmf(interval_right, ndraws - interval_left, p, interval_left)
prob_right = prob_left.dot(prob_conditional)
return prob_right


def _ecdf_band_interior_probability(prob_between_points, ndraws, lower_count, upper_count):
interval_left = np.arange(1)
prob_interior = np.ones(1)
for i in range(prob_between_points.shape[0]):
interval_right = np.arange(lower_count[i], upper_count[i])
prob_interior = _update_ecdf_band_interior_probabilities(
prob_interior, interval_left, interval_right, prob_between_points[i], ndraws
)
interval_left = interval_right
return prob_interior.sum()


@jit(nopython=True)
def _ecdf_band_interior_probability_numba(prob_between_points, ndraws, lower_count, upper_count):
interval_left = np.arange(1)
prob_interior = np.ones(1)
for i in range(prob_between_points.shape[0]):
interval_right = np.arange(lower_count[i], upper_count[i])
prob_interior = _update_ecdf_band_interior_probabilities_numba(
prob_interior, interval_left, interval_right, prob_between_points[i], ndraws
)
interval_left = interval_right
return prob_interior.sum()


def _ecdf_band_optimization_objective(
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should ideally be jitted with numba if available for a likely significant speed-up. Currently the problem is that the functions it calls use scipy.stats.binom.interval and scipy.stats.binom.pmf, which are not JIT-able. If we reimplement these functions here, then we could probably get a significant speed-up.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

interval calls ppf, which defers to Boost's implementation, which in turn uses TOMS Algorithm 748 to find a root in an interval: https://doi.org/10.1145/210089.210111, so reimplementing that to be JIT-able would be the bottleneck. Which seems like it could be a lot of work.

Copy link
Member Author

@sethaxen sethaxen Aug 20, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah wait, since interval is called outside the for loop and only pmf is called within, this could be simple.

@conditional_jit doesn't seem to like it if I call another conditionally-jitted function within the first conditionally-jitted function, e.g.

E           numba.core.errors.TypingError: Failed in nopython mode pipeline (step: nopython frontend)
E           Untyped global name '_update_ecdf_band_interior_probabilities': Cannot determine Numba type of <class 'arviz.utils.maybe_numba_fn'>

Is there a way to make this work?

prob_pointwise: float,
cdf_at_eval_points: np.ndarray,
ndraws: int,
prob_target: float,
) -> float:
"""Objective function for optimizing the simultaneous confidence band probability."""
lower, upper = _get_pointwise_confidence_band(prob_pointwise, ndraws, cdf_at_eval_points)
lower_count = (lower * ndraws).astype(int)
upper_count = (upper * ndraws).astype(int) + 1
cdf_with_zero = np.insert(cdf_at_eval_points[:-1], 0, 0)
prob_between_points = (cdf_at_eval_points - cdf_with_zero) / (1 - cdf_with_zero)
if Numba.numba_flag:
prob_interior = _ecdf_band_interior_probability_numba(
prob_between_points, ndraws, lower_count, upper_count
)
else:
prob_interior = _ecdf_band_interior_probability(
prob_between_points, ndraws, lower_count, upper_count
)
return abs(prob_interior - prob_target)


def _optimize_simultaneous_ecdf_band_probability(
ndraws: int,
eval_points: np.ndarray, # pylint: disable=unused-argument
cdf_at_eval_points: np.ndarray,
prob: float = 0.95,
**kwargs, # pylint: disable=unused-argument
):
"""Estimate probability for simultaneous confidence band using optimization.

This function simulates the pointwise probability needed to construct pointwise confidence bands
that form a `prob`-level confidence envelope for the ECDF of a sample.
"""
cdf_at_eval_points = np.unique(cdf_at_eval_points)
objective = lambda p: _ecdf_band_optimization_objective(p, cdf_at_eval_points, ndraws, prob)
prob_pointwise = minimize_scalar(objective, bounds=(prob, 1), method="bounded").x
return prob_pointwise


def _simulate_simultaneous_ecdf_band_probability(
ndraws: int,
eval_points: np.ndarray,
Expand Down
9 changes: 6 additions & 3 deletions arviz/tests/base_tests/test_plots_matplotlib.py
Original file line number Diff line number Diff line change
Expand Up @@ -1285,10 +1285,11 @@ def test_plot_ecdf_eval_points():
assert axes is not None


@pytest.mark.parametrize("confidence_bands", [True, "pointwise", "simulated"])
def test_plot_ecdf_confidence_bands(confidence_bands):
@pytest.mark.parametrize("confidence_bands", [True, "pointwise", "optimized", "simulated"])
@pytest.mark.parametrize("ndraws", [100, 10_000])
def test_plot_ecdf_confidence_bands(confidence_bands, ndraws):
"""Check that all confidence_bands values correctly accepted"""
data = np.random.randn(4, 1000)
data = np.random.randn(4, ndraws // 4)
axes = plot_ecdf(data, confidence_bands=confidence_bands, cdf=norm(0, 1).cdf)
assert axes is not None

Expand Down Expand Up @@ -1326,6 +1327,8 @@ def test_plot_ecdf_error():
# contradictory confidence band types
with pytest.raises(ValueError):
plot_ecdf(data, cdf=dist.cdf, confidence_bands="simulated", pointwise=True)
with pytest.raises(ValueError):
plot_ecdf(data, cdf=dist.cdf, confidence_bands="optimized", pointwise=True)
plot_ecdf(data, cdf=dist.cdf, confidence_bands=True, pointwise=True)
plot_ecdf(data, cdf=dist.cdf, confidence_bands="pointwise")

Expand Down
17 changes: 15 additions & 2 deletions arviz/tests/base_tests/test_stats_ecdf_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,13 @@
_get_pointwise_confidence_band,
)

try:
import numba # pylint: disable=unused-import

numba_options = [True, False]
except ImportError:
numba_options = [False]


def test_compute_ecdf():
"""Test compute_ecdf function."""
Expand Down Expand Up @@ -109,9 +116,15 @@ def test_get_pointwise_confidence_band(dist, prob, ndraws, num_trials=1_000, see
ids=["continuous", "continuous default rvs", "discrete"],
)
@pytest.mark.parametrize("ndraws", [10_000])
@pytest.mark.parametrize("method", ["pointwise", "simulated"])
def test_ecdf_confidence_band(dist, rvs, prob, ndraws, method, num_trials=1_000, seed=57):
@pytest.mark.parametrize("method", ["pointwise", "optimized", "simulated"])
@pytest.mark.parametrize("use_numba", numba_options)
def test_ecdf_confidence_band(
dist, rvs, prob, ndraws, method, use_numba, num_trials=1_000, seed=57
):
"""Test test_ecdf_confidence_band."""
if use_numba and method != "optimized":
pytest.skip("Numba only used in optimized method")

eval_points = np.linspace(*dist.interval(0.99), 10)
cdf_at_eval_points = dist.cdf(eval_points)
random_state = np.random.default_rng(seed)
Expand Down