Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor ECDF code #2311

Merged
merged 36 commits into from
Feb 21, 2024
Merged
Show file tree
Hide file tree
Changes from 29 commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
a83e950
Remove redundant call to compute_ecdf
sethaxen Feb 2, 2024
d74d0f3
Sort samples in-place
sethaxen Feb 2, 2024
622238a
Remove dead definition
sethaxen Feb 2, 2024
908162e
Remove dead variable npoints
sethaxen Feb 2, 2024
661676c
Use more descriptive variable names
sethaxen Feb 7, 2024
fd82d56
Restructure to reduce code redundancy
sethaxen Feb 7, 2024
3612cdc
Use consistent argument order
sethaxen Feb 9, 2024
3348864
Standardize more variable names
sethaxen Feb 9, 2024
c774f37
Split adjust_gamma into several functions
sethaxen Feb 9, 2024
c9a9003
Define in terms of pointwise prob instead of gamma
sethaxen Feb 9, 2024
ce696dd
Refactor to use more informative function names
sethaxen Feb 9, 2024
e14bc08
Use ndraws for consistency
sethaxen Feb 9, 2024
5727fc1
Add utility function to simulate an ECDF
sethaxen Feb 9, 2024
e3bc1ab
Add type annotations
sethaxen Feb 9, 2024
0011182
Rearrange code
sethaxen Feb 9, 2024
780d395
Require CDF take vector inputs
sethaxen Feb 9, 2024
4e45374
Add notes for the future
sethaxen Feb 9, 2024
d8a57a3
Define ecdf_confidence_band
sethaxen Feb 10, 2024
57c2355
Remove rvs option
sethaxen Feb 10, 2024
c4131b7
Move ecdf utility functions to stats module
sethaxen Feb 10, 2024
573306c
Make some functions internal
sethaxen Feb 10, 2024
03c92e7
Use binom.interval
sethaxen Feb 10, 2024
7c3efe1
Forward prob keeyword
sethaxen Feb 10, 2024
52b6c24
Mark function as private
sethaxen Feb 10, 2024
1882359
Test ecdf helper functions
sethaxen Feb 10, 2024
bc7cea0
Revert "Require CDF take vector inputs"
sethaxen Feb 10, 2024
ab5b65b
Map cdf over array
sethaxen Feb 10, 2024
1016956
Fix formatting issues
sethaxen Feb 10, 2024
5c02322
Add changelog entry
sethaxen Feb 10, 2024
3f21481
Apply suggestions from code review
sethaxen Feb 14, 2024
1f13657
Use Any as type for `random_state`
sethaxen Feb 14, 2024
80327d5
Update documentation of rvs and random_state
sethaxen Feb 14, 2024
a77942b
Use `default_rng` in tests
sethaxen Feb 14, 2024
6cbcfe7
Document requirements of cdf function
sethaxen Feb 21, 2024
e01f5c9
Assume cdf accepts array inputs
sethaxen Feb 21, 2024
090af26
try making CI happy
OriolAbril Feb 21, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@

### Maintenance and fixes

- Refactor ECDF code ([2311](https://github.com/arviz-devs/arviz/pull/2311))

### Deprecation

### Documentation
Expand Down
148 changes: 36 additions & 112 deletions arviz/plots/ecdfplot.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
"""Plot ecdf or ecdf-difference plot with confidence bands."""
import numpy as np
from scipy.stats import uniform, binom
from scipy.stats import uniform

from ..rcparams import rcParams
from ..stats.ecdf_utils import compute_ecdf, ecdf_confidence_band, _get_ecdf_points
from .plot_utils import get_plotting_function


Expand All @@ -26,7 +27,7 @@ def plot_ecdf(
show=None,
backend=None,
backend_kwargs=None,
**kwargs
**kwargs,
):
r"""Plot ECDF or ECDF-Difference Plot with Confidence bands.

Expand Down Expand Up @@ -180,75 +181,47 @@ def plot_ecdf(
values = np.ravel(values)
values.sort()

## This block computes gamma and uses it to get the upper and lower confidence bands
## Here we check if we want confidence bands or not
if confidence_bands:
## If plotting PIT then we find the PIT values of sample.
## Basically here we generate the evaluation points(x) and find the PIT values.
## z is the evaluation point for our uniform distribution in compute_gamma()
if pit:
x = np.linspace(1 / npoints, 1, npoints)
z = x
## Finding PIT for our sample
probs = cdf(values) if cdf else compute_ecdf(values2, values) / len(values2)
else:
## If not PIT use sample for plots and for evaluation points(x) use equally spaced
## points between minimum and maximum of sample
## For z we have used cdf(x)
x = np.linspace(values[0], values[-1], npoints)
z = cdf(x) if cdf else compute_ecdf(values2, x)
probs = values

n = len(values) # number of samples
## Computing gamma
gamma = fpr if pointwise else compute_gamma(n, z, npoints, num_trials, fpr)
## Using gamma to get the confidence intervals
lower, higher = get_lims(gamma, n, z)

## This block is for whether to plot ECDF or ECDF-difference
if not difference:
## We store the coordinates of our ecdf in x_coord, y_coord
x_coord, y_coord = get_ecdf_points(x, probs, difference)
if pit:
eval_points = np.linspace(1 / npoints, 1, npoints)
if cdf:
sample = np.apply_along_axis(cdf, 0, values)
sethaxen marked this conversation as resolved.
Show resolved Hide resolved
else:
## Here we subtract the ecdf value as here we are plotting the ECDF-difference
x_coord, y_coord = get_ecdf_points(x, probs, difference)
for i, x_i in enumerate(x):
y_coord[i] = y_coord[i] - (
x_i if pit else cdf(x_i) if cdf else compute_ecdf(values2, x_i)
)

## Similarly we subtract from the upper and lower bounds
if pit:
lower = lower - x
higher = higher - x
else:
lower = lower - (cdf(x) if cdf else compute_ecdf(values2, x))
higher = higher - (cdf(x) if cdf else compute_ecdf(values2, x))

sample = compute_ecdf(values2, values) / len(values2)
cdf_at_eval_points = eval_points
rvs = uniform(0, 1).rvs
else:
if pit:
x = np.linspace(1 / npoints, 1, npoints)
probs = cdf(values)
eval_points = np.linspace(values[0], values[-1], npoints)
sample = values
if confidence_bands or difference:
if cdf:
cdf_at_eval_points = np.apply_along_axis(cdf, 0, eval_points)
else:
cdf_at_eval_points = compute_ecdf(values2, eval_points)
else:
x = np.linspace(values[0], values[-1], npoints)
probs = values
cdf_at_eval_points = np.zeros_like(eval_points)
rvs = None

x_coord, y_coord = _get_ecdf_points(sample, eval_points, difference)

if difference:
y_coord -= cdf_at_eval_points

if confidence_bands:
ndraws = len(values)
band_kwargs = {"prob": 1 - fpr, "num_trials": num_trials, "rvs": rvs, "random_state": None}
band_kwargs["method"] = "pointwise" if pointwise else "simulated"
lower, higher = ecdf_confidence_band(ndraws, eval_points, cdf_at_eval_points, **band_kwargs)

if difference:
lower -= cdf_at_eval_points
higher -= cdf_at_eval_points
else:
lower, higher = None, None
## This block is for whether to plot ECDF or ECDF-difference
if not difference:
x_coord, y_coord = get_ecdf_points(x, probs, difference)
else:
## Here we subtract the ecdf value as here we are plotting the ECDF-difference
x_coord, y_coord = get_ecdf_points(x, probs, difference)
for i, x_i in enumerate(x):
y_coord[i] = y_coord[i] - (
x_i if pit else cdf(x_i) if cdf else compute_ecdf(values2, x_i)
)

ecdf_plot_args = dict(
x_coord=x_coord,
y_coord=y_coord,
x_bands=x,
x_bands=eval_points,
lower=lower,
higher=higher,
confidence_bands=confidence_bands,
Expand All @@ -260,7 +233,7 @@ def plot_ecdf(
ax=ax,
show=show,
backend_kwargs=backend_kwargs,
**kwargs
**kwargs,
)

if backend is None:
Expand All @@ -271,52 +244,3 @@ def plot_ecdf(
ax = plot(**ecdf_plot_args)

return ax


def compute_ecdf(sample, z):
"""Compute ECDF.

This function computes the ecdf value at the evaluation point
or a sorted set of evaluation points.
"""
return np.searchsorted(sample, z, side="right") / len(sample)


def get_ecdf_points(x, probs, difference):
"""Compute the coordinates for the ecdf points using compute_ecdf."""
y = compute_ecdf(probs, x)

if not difference:
x = np.insert(x, 0, x[0])
y = np.insert(y, 0, 0)
return x, y


def compute_gamma(n, z, npoints=None, num_trials=1000, fpr=0.05):
"""Compute gamma for confidence interval calculation.

This function simulates an adjusted value of gamma to account for multiplicity
when forming an 1-fpr level confidence envelope for the ECDF of a sample.
"""
if npoints is None:
npoints = n
gamma = []
for _ in range(num_trials):
unif_samples = uniform.rvs(0, 1, n)
unif_samples = np.sort(unif_samples)
gamma_m = 1000
## Can compute ecdf for all the z together or one at a time.
f_z = compute_ecdf(unif_samples, z)
f_z = compute_ecdf(unif_samples, z)
gamma_m = 2 * min(
np.amin(binom.cdf(n * f_z, n, z)), np.amin(1 - binom.cdf(n * f_z - 1, n, z))
)
gamma.append(gamma_m)
return np.quantile(gamma, fpr)


def get_lims(gamma, n, z):
"""Compute the simultaneous 1 - fpr level confidence bands."""
lower = binom.ppf(gamma / 2, n, z)
upper = binom.ppf(1 - gamma / 2, n, z)
return lower / n, upper / n
163 changes: 163 additions & 0 deletions arviz/stats/ecdf_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
"""Functions for evaluating ECDFs and their confidence bands."""
from typing import Callable, Optional, Tuple
import warnings

import numpy as np
from scipy.stats import uniform, binom


def compute_ecdf(sample: np.ndarray, eval_points: np.ndarray) -> np.ndarray:
"""Compute ECDF of the sorted `sample` at the evaluation points."""
return np.searchsorted(sample, eval_points, side="right") / len(sample)


def _get_ecdf_points(
sample: np.ndarray, eval_points: np.ndarray, difference: bool
) -> Tuple[np.ndarray, np.ndarray]:
"""Compute the coordinates for the ecdf points using compute_ecdf."""
x = eval_points
y = compute_ecdf(sample, eval_points)

if not difference and y[0] > 0:
x = np.insert(x, 0, x[0])
y = np.insert(y, 0, 0)
return x, y


def _simulate_ecdf(
ndraws: int,
eval_points: np.ndarray,
rvs: Callable[[int, Optional[np.random.RandomState]], np.ndarray],
random_state: Optional[np.random.RandomState] = None,
sethaxen marked this conversation as resolved.
Show resolved Hide resolved
) -> np.ndarray:
"""Simulate ECDF at the `eval_points` using the given random variable sampler"""
sample = rvs(ndraws, random_state=random_state)
sample.sort()
return compute_ecdf(sample, eval_points)


def _fit_pointwise_band_probability(
ndraws: int,
ecdf_at_eval_points: np.ndarray,
cdf_at_eval_points: np.ndarray,
) -> float:
"""Compute the smallest marginal probability of a pointwise confidence band that
contains the ECDF."""
ecdf_scaled = (ndraws * ecdf_at_eval_points).astype(int)
prob_lower_tail = np.amin(binom.cdf(ecdf_scaled, ndraws, cdf_at_eval_points))
prob_upper_tail = np.amin(binom.sf(ecdf_scaled - 1, ndraws, cdf_at_eval_points))
prob_pointwise = 1 - 2 * min(prob_lower_tail, prob_upper_tail)
return prob_pointwise


def _get_pointwise_confidence_band(
prob: float, ndraws: int, cdf_at_eval_points: np.ndarray
) -> Tuple[np.ndarray, np.ndarray]:
"""Compute the `prob`-level pointwise confidence band."""
count_lower, count_upper = binom.interval(prob, ndraws, cdf_at_eval_points)
prob_lower = count_lower / ndraws
prob_upper = count_upper / ndraws
return prob_lower, prob_upper


def ecdf_confidence_band(
ndraws: int,
eval_points: np.ndarray,
cdf_at_eval_points: np.ndarray,
prob: float = 0.95,
method="simulated",
**kwargs,
) -> Tuple[np.ndarray, np.ndarray]:
"""Compute the `prob`-level confidence band for the ECDF.

Arguments
---------
ndraws : int
Number of samples in the original dataset.
eval_points : np.ndarray
Points at which the ECDF is evaluated. If these are dependent on the sample
values, simultaneous confidence bands may not be correctly calibrated.
cdf_at_eval_points : np.ndarray
CDF values at the evaluation points.
prob : float, default 0.95
The target probability that a true ECDF lies within the confidence band.
method : string, default "simulated"
The method used to compute the confidence band. Valid options are:
- "pointwise": Compute the pointwise (i.e. marginal) confidence band.
- "simulated": Use Monte Carlo simulation to estimate a simultaneous confidence band.
`rvs` must be provided.
rvs: callable, optional
A function with signature
`rvs(n: int, random_state: Optional[np.random.RandomState]) -> np.ndarray`
that returns `n` samples from the distribution of the original dataset.
Required if `method` is "simulated" and variable is discrete.
sethaxen marked this conversation as resolved.
Show resolved Hide resolved
num_trials : int, default 1000
The number of random ECDFs to generate for constructing simultaneous confidence bands
(if `method` is "simulated").
random_state : np.random.RandomState, optional
Random state to be used if `method` is "simulated".

Returns
-------
prob_lower : np.ndarray
Lower confidence band for the ECDF at the evaluation points.
prob_upper : np.ndarray
Upper confidence band for the ECDF at the evaluation points.
"""
if not 0 < prob < 1:
raise ValueError(f"Invalid value for `prob`. Expected 0 < prob < 1, but got {prob}.")

if method == "pointwise":
prob_pointwise = prob
elif method == "simulated":
prob_pointwise = _simulate_simultaneous_ecdf_band_probability(
ndraws, eval_points, cdf_at_eval_points, prob=prob, **kwargs
)
else:
raise ValueError(f"Unknown method {method}. Valid options are 'pointwise' or 'simulated'.")

prob_lower, prob_upper = _get_pointwise_confidence_band(
prob_pointwise, ndraws, cdf_at_eval_points
)

return prob_lower, prob_upper


def _simulate_simultaneous_ecdf_band_probability(
ndraws: int,
eval_points: np.ndarray,
cdf_at_eval_points: np.ndarray,
prob: float = 0.95,
rvs: Optional[Callable[[int, Optional[np.random.RandomState]], np.ndarray]] = None,
num_trials: int = 1000,
random_state: Optional[np.random.RandomState] = None,
) -> float:
"""Estimate probability for simultaneous confidence band using simulation.

This function simulates the pointwise probability needed to construct pointwise
confidence bands that form a `prob`-level confidence envelope for the ECDF
of a sample.
"""
if rvs is None:
warnings.warn(
"Assuming variable is continuous for calibration of pointwise bands. "
"If the variable is discrete, specify random variable sampler `rvs`.",
UserWarning,
)
# if variable continuous, we can calibrate the confidence band using a uniform
# distribution
rvs = uniform(0, 1).rvs
eval_points_sim = cdf_at_eval_points
else:
eval_points_sim = eval_points

probs_pointwise = []
for _ in range(num_trials):
ecdf_at_eval_points = _simulate_ecdf(
ndraws, eval_points_sim, rvs, random_state=random_state
)
prob_pointwise = _fit_pointwise_band_probability(
ndraws, ecdf_at_eval_points, cdf_at_eval_points
)
probs_pointwise.append(prob_pointwise)
sethaxen marked this conversation as resolved.
Show resolved Hide resolved
return np.quantile(probs_pointwise, prob)
Loading
Loading