Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

SMC-ABC add distance, refactor and update notebook #3996

Merged
merged 10 commits into from
Jul 16, 2020
339 changes: 138 additions & 201 deletions docs/source/notebooks/SMC-ABC_Lotka-Volterra_example.ipynb

Large diffs are not rendered by default.

100 changes: 89 additions & 11 deletions pymc3/distributions/simulator.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,27 +13,77 @@
# limitations under the License.

import numpy as np
from .distribution import NoDistribution
from .distribution import NoDistribution, draw_values

__all__ = ["Simulator"]


class Simulator(NoDistribution):
def __init__(self, function, *args, params=None, **kwargs):
def __init__(
self,
function,
*args,
params=None,
distance="gaussian_kernel",
sum_stat="identity",
epsilon=1,
**kwargs,
):
"""
This class stores a function defined by the user in python language.

function: function
Simulation function defined by the user.
params: list
Parameters passed to function.
distance: str or callable
Distance functions. Available options are "gaussian_kernel" (default), "wasserstein",
"energy" or a user defined function
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
"energy" or a user defined function
"energy" or a user defined function that takes epsilon (a scaler), and (the summary statistics of) observed_data, and simulated_data as input.

``gaussian_kernel`` :math: `\sum \left(-0.5 \left(\frac{xo - xs}{\epsilon}\right)^2\right)`
``wasserstein`` :math: `\frac{1}{n} \sum{\left(\frac{|xo - xs|}{\epsilon}\right)}`
``energy`` :math: `\sqrt{2} \sqrt{\frac{1}{n} \sum \left(\frac{|xo - xs|}{\epsilon}\right)^2}`
For the wasserstein and energy distances the observed data xo and simulated data xs
are internally sorted (i.e. the sum_stat is "sort").
sum_stat: str or callable
        Summary statistics. Available options are ``identity``, ``sort``, ``mean``, ``median``.
        If a callable is passed it should return a number or a 1d numpy array.
epsilon: float
Standard deviation of the gaussian_kernel.
*args and **kwargs:
Arguments and keywords arguments that the function takes.
"""

self.function = function
self.params = params
observed = self.data
self.epsilon = epsilon

if distance == "gaussian_kernel":
self.distance = gaussian_kernel
elif distance == "wasserstein":
self.distance = wasserstein
sum_stat = "sort"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we do a assertion here or a warning that sum_stat need to be "sort" for wasserstein and energy?

elif distance == "energy":
self.distance = energy
sum_stat = "sort"
elif hasattr(distance, "__call__"):
self.distance = distance
else:
raise ValueError(f"The distance metric {distance} is not implemented")

if sum_stat == "identity":
self.sum_stat = identity
elif sum_stat == "sort":
self.sum_stat = np.sort
elif sum_stat == "mean":
self.sum_stat = np.mean
elif sum_stat == "median":
self.sum_stat = np.median
elif hasattr(sum_stat, "__call__"):
self.sum_stat = sum_stat
else:
raise ValueError(f"The summary statistics {sum_stat} is not implemented")

super().__init__(shape=np.prod(observed.shape), dtype=observed.dtype, *args, **kwargs)

def random(self, point=None, size=None):
Expand All @@ -51,16 +101,44 @@ def random(self, point=None, size=None):
-------
array
"""

raise NotImplementedError("Not implemented yet")
params = draw_values([*self.params], point=point, size=size)
if size is None:
return self.function(*params)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if self.function is a theano function it will return a tensor - maybe we can push the draw_values call to the end and return that?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

self.function is intended to be a Python function not a Theano one.

else:
return np.array([self.function(*params) for _ in range(size)])

def _repr_latex_(self, name=None, dist=None):
if dist is None:
dist = self
name = r"\text{%s}" % name
function = dist.function
params = dist.parameters
sum_stat = dist.sum_stat
return r"${} \sim \text{{Simulator}}(\mathit{{function}}={},~\mathit{{parameters}}={},~\mathit{{summary statistics}}={})$".format(
name, function, params, sum_stat
)
name = name
function = dist.function.__name__
params = ", ".join([var.name for var in dist.params])
sum_stat = self.sum_stat.__name__ if hasattr(self.sum_stat, "__call__") else self.sum_stat
distance = self.distance.__name__
return f"$\\text{{{name}}} \sim \\text{{Simulator}}(\\text{{{function}}}({params}), \\text{{{distance}}}, \\text{{{sum_stat}}})$"


def identity(x):
    """Return the input unchanged.

    Used as the default summary statistic (no reduction of the data).
    """
    return x


def gaussian_kernel(epsilon, obs_data, sim_data):
    """Gaussian-kernel distance between observed and simulated data.

    Computes ``sum(-0.5 * ((obs_data - sim_data) / epsilon) ** 2)``, i.e. the
    log of an (unnormalized) Gaussian pseudo-likelihood with bandwidth epsilon.
    """
    scaled_diff = (obs_data - sim_data) / epsilon
    return np.sum(-0.5 * scaled_diff ** 2)


def wasserstein(epsilon, obs_data, sim_data):
    """Wasserstein distance between observed and simulated data.

    Computes the mean absolute scaled deviation
    ``mean(|(obs_data - sim_data) / epsilon|)``.

    We are assuming obs_data and sim_data are already sorted!
    """
    deviation = np.abs((obs_data - sim_data) / epsilon)
    return np.mean(deviation)


def energy(epsilon, obs_data, sim_data):
    """Energy distance function.

    Computes ``sqrt(2) * sqrt(mean(((obs_data - sim_data) / epsilon) ** 2))``,
    matching the documented formula. The sqrt(2) factor was previously the
    truncated literal 1.4142; use the exact value for full float precision.

    We are assuming obs_data and sim_data are already sorted!
    """
    scaled_diff = (obs_data - sim_data) / epsilon
    return np.sqrt(2.0) * np.sqrt(np.mean(scaled_diff ** 2))
53 changes: 29 additions & 24 deletions pymc3/smc/sample_smc.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,7 @@ def sample_smc(
tune_steps=True,
p_acc_rate=0.99,
threshold=0.5,
epsilon=1.0,
dist_func="gaussian_kernel",
sum_stat="identity",
save_sim_data=False,
model=None,
random_seed=-1,
parallel=False,
Expand Down Expand Up @@ -74,13 +72,9 @@ def sample_smc(
Determines the change of beta from stage to stage, i.e.indirectly the number of stages,
the higher the value of `threshold` the higher the number of stages. Defaults to 0.5.
It should be between 0 and 1.
epsilon: float
Standard deviation of the gaussian pseudo likelihood. Only works with `kernel = ABC`
dist_func: str
Distance function. The only available option is ``gaussian_kernel``
sum_stat: str or callable
Summary statistics. Available options are ``indentity``, ``sorted``, ``mean``, ``median``.
If a callable is based it should return a number or a 1d numpy array.
save_sim_data : bool
Whether or not to save the simulated data. This parameters only work with the ABC kernel.
The stored data corresponds to the posterior predictive distribution.
model: Model (optional if in ``with`` context)).
random_seed: int
random seed
Expand Down Expand Up @@ -148,8 +142,15 @@ def sample_smc(

if chains is None:
chains = max(2, cores)
elif chains == 1:
cores = 1

_log.info(f"Multiprocess sampling ({chains} chains in {cores} jobs)")
_log.info(
(
f"Multiprocess sampling ({chains} chain{'s' if chains > 1 else ''} "
f"in {cores} job{'s' if cores > 1 else ''})"
)
)

if random_seed == -1:
random_seed = None
Expand All @@ -175,14 +176,12 @@ def sample_smc(
tune_steps,
p_acc_rate,
threshold,
epsilon,
dist_func,
sum_stat,
save_sim_data,
model,
)

t1 = time.time()
if parallel:
if parallel and chains > 1:
loggers = [_log] + [None] * (chains - 1)
pool = mp.Pool(cores)
results = pool.starmap(
Expand All @@ -196,7 +195,7 @@ def sample_smc(
for i in range(chains):
results.append((sample_smc_int(*params, random_seed[i], i, _log)))

traces, log_marginal_likelihoods, betas, accept_ratios, nsteps = zip(*results)
traces, sim_data, log_marginal_likelihoods, betas, accept_ratios, nsteps = zip(*results)
trace = MultiTrace(traces)
trace.report._n_draws = draws
trace.report._n_tune = 0
Expand All @@ -206,7 +205,10 @@ def sample_smc(
trace.report.accept_ratios = accept_ratios
trace.report.nsteps = nsteps

return trace
if save_sim_data:
return trace, {modelcontext(model).observed_RVs[0].name: np.array(sim_data)}
else:
return trace


def sample_smc_int(
Expand All @@ -217,9 +219,7 @@ def sample_smc_int(
tune_steps,
p_acc_rate,
threshold,
epsilon,
dist_func,
sum_stat,
save_sim_data,
model,
random_seed,
chain,
Expand All @@ -234,9 +234,7 @@ def sample_smc_int(
tune_steps=tune_steps,
p_acc_rate=p_acc_rate,
threshold=threshold,
epsilon=epsilon,
dist_func=dist_func,
sum_stat=sum_stat,
save_sim_data=save_sim_data,
model=model,
random_seed=random_seed,
chain=chain,
Expand All @@ -262,4 +260,11 @@ def sample_smc_int(
accept_ratios.append(smc.acc_rate)
nsteps.append(smc.n_steps)

return smc.posterior_to_trace(), smc.log_marginal_likelihood, betas, accept_ratios, nsteps
return (
smc.posterior_to_trace(),
smc.sim_data,
smc.log_marginal_likelihood,
betas,
accept_ratios,
nsteps,
)
Loading