Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

SMC-ABC add distance, refactor and update notebook #3996

Merged
merged 10 commits into from
Jul 16, 2020
339 changes: 138 additions & 201 deletions docs/source/notebooks/SMC-ABC_Lotka-Volterra_example.ipynb

Large diffs are not rendered by default.

100 changes: 89 additions & 11 deletions pymc3/distributions/simulator.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,27 +13,77 @@
# limitations under the License.

import numpy as np
from .distribution import NoDistribution
from .distribution import NoDistribution, draw_values

__all__ = ["Simulator"]


class Simulator(NoDistribution):
def __init__(self, function, *args, params=None, **kwargs):
def __init__(
self,
function,
*args,
params=None,
distance="gaussian_kernel",
sum_stat="identity",
epsilon=1,
**kwargs,
):
"""
This class stores a function defined by the user in python language.

function: function
Simulation function defined by the user.
params: list
Parameters passed to function.
distance: str or callable
Distance functions. Available options are "gaussian_kernel" (default), "wasserstein",
"energy" or a user defined function
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
"energy" or a user defined function
"energy" or a user defined function that takes epsilon (a scaler), and (the summary statistics of) observed_data, and simulated_data as input.

``gaussian_kernel`` :math: `\sum \left(-0.5 \left(\frac{xo - xs}{\epsilon}\right)^2\right)`
``wasserstein`` :math: `\frac{1}{n} \sum{\left(\frac{|xo - xs|}{\epsilon}\right)}`
``energy`` :math: `\sqrt{2} \sqrt{\frac{1}{n} \sum \left(\frac{|xo - xs|}{\epsilon}\right)^2}`
For the wasserstein and energy distances the observed data xo and simulated data xs
are internally sorted (i.e. the sum_stat is "sort").
sum_stat: str or callable
        Summary statistics. Available options are ``identity``, ``sort``, ``mean``, ``median``.
        If a callable is passed it should return a number or a 1d numpy array.
epsilon: float
Standard deviation of the gaussian_kernel.
*args and **kwargs:
Arguments and keywords arguments that the function takes.
"""

self.function = function
self.params = params
observed = self.data
self.epsilon = epsilon

if distance == "gaussian_kernel":
self.distance = gaussian_kernel
elif distance == "wasserstein":
self.distance = wasserstein
sum_stat = "sort"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we do a assertion here or a warning that sum_stat need to be "sort" for wasserstein and energy?

elif distance == "energy":
self.distance = energy
sum_stat = "sort"
elif hasattr(distance, "__call__"):
self.distance = distance
else:
raise ValueError(f"The distance metric {distance} is not implemented")

if sum_stat == "identity":
self.sum_stat = identity
elif sum_stat == "sort":
self.sum_stat = np.sort
elif sum_stat == "mean":
self.sum_stat = np.mean
elif sum_stat == "median":
self.sum_stat = np.median
elif hasattr(sum_stat, "__call__"):
self.sum_stat = sum_stat
else:
raise ValueError(f"The summary statistics {sum_stat} is not implemented")

super().__init__(shape=np.prod(observed.shape), dtype=observed.dtype, *args, **kwargs)

def random(self, point=None, size=None):
Expand All @@ -51,16 +101,44 @@ def random(self, point=None, size=None):
-------
array
"""

raise NotImplementedError("Not implemented yet")
params = draw_values([*self.params], point=point, size=size)
if size is None:
return self.function(*params)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if self.function is a theano function it will return a tensor - maybe we can push the draw_values call to the end and return that?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

self.function is intended to be a Python function not a Theano one.

else:
return np.array([self.function(*params) for _ in range(size)])

def _repr_latex_(self, name=None, dist=None):
if dist is None:
dist = self
name = r"\text{%s}" % name
function = dist.function
params = dist.parameters
sum_stat = dist.sum_stat
return r"${} \sim \text{{Simulator}}(\mathit{{function}}={},~\mathit{{parameters}}={},~\mathit{{summary statistics}}={})$".format(
name, function, params, sum_stat
)
name = name
function = dist.function.__name__
params = ", ".join([var.name for var in dist.params])
sum_stat = self.sum_stat.__name__ if hasattr(self.sum_stat, "__call__") else self.sum_stat
distance = self.distance.__name__
return f"$\\text{{{name}}} \sim \\text{{Simulator}}(\\text{{{function}}}({params}), \\text{{{distance}}}, \\text{{{sum_stat}}})$"


def identity(x):
    """Return the input unchanged.

    Used as the default summary statistic (no reduction of the data).
    """
    return x


def gaussian_kernel(epsilon, obs_data, sim_data):
    """Gaussian-kernel distance between observed and simulated data.

    Computes ``sum(-0.5 * ((obs_data - sim_data) / epsilon) ** 2)``, i.e. the
    log of an (unnormalized) Gaussian pseudo-likelihood with bandwidth epsilon.
    """
    scaled_diff = (obs_data - sim_data) / epsilon
    return np.sum(-0.5 * scaled_diff ** 2)


def wasserstein(epsilon, obs_data, sim_data):
    """Wasserstein distance between observed and simulated data.

    Computes the mean absolute scaled deviation
    ``mean(|(obs_data - sim_data) / epsilon|)``.

    We are assuming obs_data and sim_data are already sorted!
    """
    deviation = np.abs((obs_data - sim_data) / epsilon)
    return np.mean(deviation)


def energy(epsilon, obs_data, sim_data):
    """Energy distance function.

    Computes ``sqrt(2) * sqrt(mean(((obs_data - sim_data) / epsilon) ** 2))``,
    matching the documented formula. The sqrt(2) factor was previously the
    truncated literal 1.4142; use the exact value for full float precision.

    We are assuming obs_data and sim_data are already sorted!
    """
    scaled_diff = (obs_data - sim_data) / epsilon
    return np.sqrt(2.0) * np.sqrt(np.mean(scaled_diff ** 2))
53 changes: 29 additions & 24 deletions pymc3/smc/sample_smc.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,7 @@ def sample_smc(
tune_steps=True,
p_acc_rate=0.99,
threshold=0.5,
epsilon=1.0,
dist_func="gaussian_kernel",
sum_stat="identity",
save_sim_data=False,
model=None,
random_seed=-1,
parallel=False,
Expand Down Expand Up @@ -74,13 +72,9 @@ def sample_smc(
Determines the change of beta from stage to stage, i.e.indirectly the number of stages,
the higher the value of `threshold` the higher the number of stages. Defaults to 0.5.
It should be between 0 and 1.
epsilon: float
Standard deviation of the gaussian pseudo likelihood. Only works with `kernel = ABC`
dist_func: str
Distance function. The only available option is ``gaussian_kernel``
sum_stat: str or callable
Summary statistics. Available options are ``indentity``, ``sorted``, ``mean``, ``median``.
If a callable is based it should return a number or a 1d numpy array.
save_sim_data : bool
Whether or not to save the simulated data. This parameters only work with the ABC kernel.
The stored data corresponds to the posterior predictive distribution.
model: Model (optional if in ``with`` context)).
random_seed: int
random seed
Expand Down Expand Up @@ -148,8 +142,15 @@ def sample_smc(

if chains is None:
chains = max(2, cores)
elif chains == 1:
cores = 1

_log.info(f"Multiprocess sampling ({chains} chains in {cores} jobs)")
_log.info(
(
f"Multiprocess sampling ({chains} chain{'s' if chains > 1 else ''} "
f"in {cores} job{'s' if cores > 1 else ''})"
)
)

if random_seed == -1:
random_seed = None
Expand All @@ -175,14 +176,12 @@ def sample_smc(
tune_steps,
p_acc_rate,
threshold,
epsilon,
dist_func,
sum_stat,
save_sim_data,
model,
)

t1 = time.time()
if parallel:
if parallel and chains > 1:
loggers = [_log] + [None] * (chains - 1)
pool = mp.Pool(cores)
results = pool.starmap(
Expand All @@ -196,7 +195,7 @@ def sample_smc(
for i in range(chains):
results.append((sample_smc_int(*params, random_seed[i], i, _log)))

traces, log_marginal_likelihoods, betas, accept_ratios, nsteps = zip(*results)
traces, sim_data, log_marginal_likelihoods, betas, accept_ratios, nsteps = zip(*results)
trace = MultiTrace(traces)
trace.report._n_draws = draws
trace.report._n_tune = 0
Expand All @@ -206,7 +205,10 @@ def sample_smc(
trace.report.accept_ratios = accept_ratios
trace.report.nsteps = nsteps

return trace
if save_sim_data:
return trace, {modelcontext(model).observed_RVs[0].name: np.array(sim_data)}
else:
return trace


def sample_smc_int(
Expand All @@ -217,9 +219,7 @@ def sample_smc_int(
tune_steps,
p_acc_rate,
threshold,
epsilon,
dist_func,
sum_stat,
save_sim_data,
model,
random_seed,
chain,
Expand All @@ -234,9 +234,7 @@ def sample_smc_int(
tune_steps=tune_steps,
p_acc_rate=p_acc_rate,
threshold=threshold,
epsilon=epsilon,
dist_func=dist_func,
sum_stat=sum_stat,
save_sim_data=save_sim_data,
model=model,
random_seed=random_seed,
chain=chain,
Expand All @@ -262,4 +260,11 @@ def sample_smc_int(
accept_ratios.append(smc.acc_rate)
nsteps.append(smc.n_steps)

return smc.posterior_to_trace(), smc.log_marginal_likelihood, betas, accept_ratios, nsteps
return (
smc.posterior_to_trace(),
smc.sim_data,
smc.log_marginal_likelihood,
betas,
accept_ratios,
nsteps,
)
Loading