diff --git a/pymare/core.py b/pymare/core.py
index 57e6e73..3a3f7e8 100644
--- a/pymare/core.py
+++ b/pymare/core.py
@@ -6,7 +6,9 @@ import pandas as pd
 
 from .estimators import (WeightedLeastSquares, DerSimonianLaird,
-                         LikelihoodBased, StanMetaRegression)
+                         VarianceBasedLikelihoodEstimator,
+                         SampleSizeBasedLikelihoodEstimator,
+                         StanMetaRegression)
 from .stats import ensure_2d
 
 
@@ -15,9 +17,12 @@ class Dataset:
 
     Args:
        estimates (array-like): 1d array of study-level estimates with length K
-        variances (array-like): 1d array of study-level variances with length K
+        variances (array-like, optional): 1d array of study-level variances
+            with length K
        predictors (array-like, optional): 1d or 2d array containing
            study-level predictors (or covariates); has dimensions K x P
+        sample_sizes (array-like, optional): 1d array of study-level sample
+            sizes (length K)
        names ([str], optional): List of length P containing the names of the
            predictors
        add_intercept (bool, optional): If True, an intercept column is
@@ -25,12 +30,13 @@ class Dataset:
            predictors matrix is passed as-is to estimators.
        kwargs (dict, optional): Keyword arguments to pass onto estimators
    """
-    def __init__(self, estimates, variances, predictors=None, names=None,
-                 add_intercept=True, **kwargs):
+    def __init__(self, estimates, variances=None, predictors=None,
+                 sample_sizes=None, names=None, add_intercept=True, **kwargs):
        self.estimates = ensure_2d(estimates)
        self.variances = ensure_2d(variances)
+        self.sample_sizes = ensure_2d(sample_sizes)
        self.kwargs = kwargs
-        X, n = self._setup_predictors(predictors, names, add_intercept)
+        X, n = self._get_predictors(predictors, names, add_intercept)
        self.predictors = X
        self.names = n
@@ -38,9 +44,10 @@ def __getattr__(self, key):
        # Provide convenient access to stored kwargs.
        if key in self.kwargs:
            return self.kwargs[key]
-        raise AttributeError
+        raise AttributeError("'{}' object has no attribute '{}'".format(
+            self.__class__.__name__, key))
 
-    def _setup_predictors(self, X, names, add_intercept):
+    def _get_predictors(self, X, names, add_intercept):
        if X is None and not add_intercept:
            raise ValueError("No fixed predictors found. If no X matrix is "
                             "provided, add_intercept must be True!")
@@ -67,17 +75,26 @@ def X(self):
        """Alias for the `predictors` attribute."""
        return self.predictors
 
+    @property
+    def n(self):
+        """Alias for the `sample_sizes` attribute."""
+        return self.sample_sizes
+
 
-def meta_regression(estimates, variances, predictors=None, names=None,
-                    add_intercept=True, method='ML', ci_method='QP',
-                    alpha=0.05, **kwargs):
+def meta_regression(estimates, variances=None, predictors=None,
+                    sample_sizes=None, names=None, add_intercept=True,
+                    method='ML', ci_method='QP', alpha=0.05, **kwargs):
    """Fits the standard meta-regression/meta-analysis model to provided data.
 
    Args:
-        estimates ([float]): 1d array of study-level estimates with length K
-        variances ([float]): 1d array of study-level variances with length K
-        predictors ([float], optional): 1d or 2d array containing study-level
-            predictors (or covariates); has dimensions K x P
+        estimates (array-like): 1d array of study-level estimates with length K
+        variances (array-like, optional): 1d array of study-level variances
+            with length K
+        predictors (array-like, optional): 1d or 2d array containing
+            study-level predictors (or covariates); has dimensions K x P. If
+            omitted, add_intercept must be True.
+        sample_sizes (array-like, optional): 1d array of study-level sample
+            sizes (length K)
        names ([str], optional): List of length P containing the names of the
            predictors
        add_intercept (bool, optional): If True, an intercept column is
@@ -102,21 +119,29 @@ def meta_regression(estimates, variances, predictors=None, names=None,
        depending on the specified method ('Stan' will return the latter; all
        other methods return the former).
    """
-    dataset = Dataset(estimates, variances, predictors, names, add_intercept)
+    dataset = Dataset(estimates, variances, predictors, sample_sizes, names,
+                      add_intercept)
 
    method = method.lower()
 
-    estimator_cls = {
-        'ml': partial(LikelihoodBased, method=method),
-        'reml': partial(LikelihoodBased, method=method),
-        'dl': DerSimonianLaird,
-        'wls': WeightedLeastSquares,
-        'fe': WeightedLeastSquares,
-        'stan': StanMetaRegression,
-    }[method]
+    if method in ['ml', 'reml']:
+        if variances is not None:
+            est_cls = partial(VarianceBasedLikelihoodEstimator, method=method)
+        elif sample_sizes is not None:
+            est_cls = partial(SampleSizeBasedLikelihoodEstimator, method=method)
+        else:
+            raise ValueError("If method is ML or REML, one of `variances` or "
+                             "`sample_sizes` must be passed!")
+    else:
+        est_cls = {
+            'dl': DerSimonianLaird,
+            'wls': WeightedLeastSquares,
+            'fe': WeightedLeastSquares,
+            'stan': StanMetaRegression,
+        }[method]
 
    # Get estimates
-    est = estimator_cls(**kwargs)
+    est = est_cls(**kwargs)
    results = est.fit(dataset)
    if hasattr(results, 'compute_stats'):
        results.compute_stats(ci_method=ci_method, alpha=alpha)
diff --git a/pymare/estimators/__init__.py b/pymare/estimators/__init__.py
index ef1b7ab..dc18e8f 100644
--- a/pymare/estimators/__init__.py
+++ b/pymare/estimators/__init__.py
@@ -1,9 +1,12 @@
 from .estimators import (WeightedLeastSquares, DerSimonianLaird,
-                         LikelihoodBased, StanMetaRegression)
+                         VarianceBasedLikelihoodEstimator,
+                         SampleSizeBasedLikelihoodEstimator,
+                         StanMetaRegression)
 
 __all__ = [
    'WeightedLeastSquares',
    'DerSimonianLaird',
-    'LikelihoodBased',
+    'VarianceBasedLikelihoodEstimator',
+    'SampleSizeBasedLikelihoodEstimator',
    'StanMetaRegression'
 ]
diff --git a/pymare/estimators/estimators.py b/pymare/estimators/estimators.py
index cc90fed..f1b474e 100644
--- a/pymare/estimators/estimators.py
+++ b/pymare/estimators/estimators.py
@@ -1,6 +1,7 @@
 """Meta-regression estimator classes."""
 
 from abc import ABCMeta, abstractmethod
+from inspect import getfullargspec
 
 import numpy as np
 from scipy.optimize import minimize
@@ -14,14 +15,43 @@ class BaseEstimator(metaclass=ABCMeta):
    _result_cls = MetaRegressionResults
 
    @abstractmethod
-    def _fit(self, y, v, X):
-        # Subclasses should retain this minimal signature in all cases; this
-        # ensures that the fit() wrapper in the base class can be ignored by
-        # users who want to avoid input/output overhead and call _fit directly.
+    def _fit(self):
+        # Subclasses must implement a _fit() method that directly takes
+        # arrays. The following argument names are allowed, and will be
+        # automatically extracted from the Dataset instance:
+        # * y (estimates)
+        # * v (variances)
+        # * n (sample_sizes)
+        # * X (predictors)
        pass
 
+    def accepts_dataset(self, dataset):
+        """ Returns whether the current estimator can fit the passed Dataset.
+
+        Args:
+            dataset (Dataset): A Dataset instance
+
+        Returns:
+            bool: True if the Dataset has every array _fit() requires.
+ """ + args = getfullargspec(self._fit)[0][1:] + for name in args: + if getattr(dataset, name) is None: + return False + return True + def fit(self, dataset): - results = self._fit(dataset.y, dataset.v, dataset.X) + kwargs = {} + spec = getfullargspec(self._fit) + n_kw = len(spec.defaults) if spec.defaults else 0 + n_args = len(spec.args) - n_kw - 1 + for i, name in enumerate(spec.args[1:]): + if i >= n_args: + kwargs[name] = getattr(dataset, name, spec.defaults[i-n_args]) + else: + kwargs[name] = getattr(dataset, name) + + results = self._fit(**kwargs) return self._result_cls(results, dataset, self) @@ -73,20 +103,16 @@ def _fit(self, y, v, X): return {'beta': beta_dl, 'tau2': tau_dl} -class LikelihoodBased(BaseEstimator): - """ Likelihood-based meta-regression estimator. +class VarianceBasedLikelihoodEstimator(BaseEstimator): + """ Likelihood-based estimator for estimates with known variances. - Iteratively estimates the between-subject variance tau^2 and fixed effects + Iteratively estimates the between-subject variance tau^2 and fixed effect betas using the specified likelihood-based estimator (ML or REML). Args: method (str, optional): The estimation method to use. Either 'ML' (for maximum-likelihood) or 'REML' (restricted maximum-likelihood). Defaults to 'ML'. - beta (array, optional): Initial beta values to use in optimization. If - None (default), the DerSimonian-Laird estimate is used. - tau2 (float, optional): Initial tau^2 value to use in optimization. If - None (default), the DerSimonian-Laird estimate is used. kwargs (dict, optional): Keyword arguments to pass to the SciPy minimizer. @@ -96,26 +122,22 @@ class LikelihoodBased(BaseEstimator): passed in as keyword arguments. """ - def __init__(self, method='ml', beta=None, tau2=None, **kwargs): - self.method = method - self.beta = beta - self.tau2 = tau2 + def __init__(self, method='ml', **kwargs): + nll_func = getattr(self, '_{}_nll'.format(method.lower())) + if nll_func is None: + raise ValueError("No log-likelihood function defined for method " + "'{}'.".format(method)) + self._nll_func = nll_func self.kwargs = kwargs def _fit(self, y, v, X): - # use D-L estimate for initial values if none provided - if self.tau2 is None or self.beta is None: - est_DL = DerSimonianLaird()._fit(y, v, X) - beta = est_DL['beta'] if self.beta is None else self.beta - tau2 = est_DL['tau2'] if self.tau2 is None else self.tau2 - - ll_func = getattr(self, '_{}_nll'.format(self.method.lower())) - if ll_func is None: - raise ValueError("No log-likelihood function defined for method " - "'{}'.".format(self.method)) + # use D-L estimate for initial values + est_DL = DerSimonianLaird()._fit(y, v, X) + beta = est_DL['beta'] + tau2 = est_DL['tau2'] theta_init = np.r_[beta.ravel(), tau2] - res = minimize(ll_func, theta_init, (y, v, X), **self.kwargs).x + res = minimize(self._nll_func, theta_init, (y, v, X), **self.kwargs).x beta, tau = res[:-1], float(res[-1]) tau = np.max([tau, 0]) return {'beta': beta, 'tau2': tau} @@ -127,8 +149,7 @@ def _ml_nll(self, theta, y, v, X): tau2 = 0 w = 1. / (v + tau2) R = y - X.dot(beta) - ll = 0.5 * (np.log(w).sum() - (R * w * R).sum()) - return -ll + return -0.5 * (np.log(w).sum() - (R * w * R).sum()) def _reml_nll(self, theta, y, v, X): """ REML negative log-likelihood for meta-regression model. 
""" @@ -139,6 +160,69 @@ def _reml_nll(self, theta, y, v, X): return ll_ + 0.5 * np.log(np.linalg.det(F)) +class SampleSizeBasedLikelihoodEstimator(BaseEstimator): + """ Likelihood-based estimator for estimates with known sample sizes but + unknown variances. + + Iteratively estimates the between-subject variance tau^2 and fixed effect + betas using the specified likelihood-based estimator (ML or REML). + + Args: + method (str, optional): The estimation method to use. Either 'ML' (for + maximum-likelihood) or 'REML' (restricted maximum-likelihood). + Defaults to 'ML'. + beta (array, optional): Initial beta values to use in optimization. If + None (default), uses the weighted least squares estimate. + tau2 (float, optional): Initial tau^2 value to use in optimization. + Defaults to 0. + kwargs (dict, optional): Keyword arguments to pass to the SciPy + minimizer. + + Notes: + The ML and REML solutions are obtained via SciPy's scalar function + minimizer (scipy.optimize.minimize). Parameters to minimize() can be + passed in as keyword arguments. + """ + + def __init__(self, method='ml', **kwargs): + nll_func = getattr(self, '_{}_nll'.format(method.lower())) + if nll_func is None: + raise ValueError("No log-likelihood function defined for method " + "'{}'.".format(method)) + self._nll_func = nll_func + self.kwargs = kwargs + + def _fit(self, y, n, X): + # set tau^2 to 0 and compute starting values + tau2 = 0. + beta = WeightedLeastSquares(tau2=tau2)._fit(y, n, X)['beta'] + sigma = ((y - X.dot(beta))**2 * n).mean() + theta_init = np.r_[beta.ravel(), sigma, tau2] + res = minimize(self._nll_func, theta_init, (y, n, X), **self.kwargs).x + beta, sigma, tau = res[:-2], float(res[-2]), float(res[-1]) + tau = np.max([tau, 0]) + return {'beta': beta, 'sigma2': sigma, 'tau2': tau} + + def _ml_nll(self, theta, y, n, X): + """ ML negative log-likelihood for meta-regression model. """ + beta, sigma2, tau2 = theta[:-2, None], theta[-2], theta[-1] + if tau2 < 0: + tau2 = 0 + if sigma2 < 0: + sigma2 = 0 + w = 1 / (tau2 + sigma2 / n) + R = y - X.dot(beta) + return -0.5 * (np.log(w).sum() - (R * w * R).sum()) + + def _reml_nll(self, theta, y, n, X): + """ REML negative log-likelihood for meta-regression model. """ + ll_ = self._ml_nll(theta, y, n, X) + sigma2, tau2 = theta[-2:] + w = 1 / (tau2 + sigma2 / n) + F = (X * w).T.dot(X) + return ll_ + 0.5 * np.log(np.linalg.det(F)) + + class StanMetaRegression(BaseEstimator): """Bayesian meta-regression estimator using Stan. 
diff --git a/tests/conftest.py b/tests/conftest.py
index 89c0ecc..70775cd 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -17,6 +17,14 @@ def dataset(variables):
    return Dataset(*variables, names=['my_covariate'])
 
 
+@pytest.fixture(scope='package')
+def dataset_n():
+    y = np.array([[-3., -0.5, 0., -5.01, 0.35, -2., -6., -4., -4.3, -0.1, -1.]]).T
+    n = np.array([[16, 16, 20.548, 32.597, 14., 11.118, 4.444, 12.414, 26.963,
+                   130.556, 126.76]]).T / 2
+    return Dataset(y, sample_sizes=n)
+
+
 @pytest.fixture(scope='package')
 def vars_with_intercept():
    y = np.array([[-1, 0.5, 0.5, 0.5, 1, 1, 2, 10]]).T
diff --git a/tests/test_estimators.py b/tests/test_estimators.py
index 2ef0898..fbdb9da 100644
--- a/tests/test_estimators.py
+++ b/tests/test_estimators.py
@@ -1,7 +1,9 @@
 import numpy as np
 
 from pymare.estimators import (WeightedLeastSquares, DerSimonianLaird,
-                               LikelihoodBased, StanMetaRegression)
+                               VarianceBasedLikelihoodEstimator,
+                               SampleSizeBasedLikelihoodEstimator,
+                               StanMetaRegression)
 
 
 def test_weighted_least_squares_estimator(dataset):
@@ -26,22 +28,44 @@ def test_dersimonian_laird_estimator(dataset):
    assert np.allclose(tau2, 8.3627, atol=1e-4)
 
 
-def test_maximum_likelihood_estimator(dataset):
+def test_variance_based_maximum_likelihood_estimator(dataset):
    # ground truth values are from metafor package in R
-    results = LikelihoodBased(method='ML').fit(dataset)
+    results = VarianceBasedLikelihoodEstimator(method='ML').fit(dataset)
    beta, tau2 = results['beta']['est'], results['tau2']['est']
    assert np.allclose(beta, [-0.1072, 0.7653], atol=1e-4)
    assert np.allclose(tau2, 7.7649, atol=1e-4)
 
 
-def test_restricted_maximum_likelihood_estimator(dataset):
+def test_variance_based_restricted_maximum_likelihood_estimator(dataset):
    # ground truth values are from metafor package in R
-    results = LikelihoodBased(method='REML').fit(dataset)
+    results = VarianceBasedLikelihoodEstimator(method='REML').fit(dataset)
    beta, tau2 = results['beta']['est'], results['tau2']['est']
    assert np.allclose(beta, [-0.1066, 0.7700], atol=1e-4)
    assert np.allclose(tau2, 10.9499, atol=1e-4)
 
 
+def test_sample_size_based_maximum_likelihood_estimator(dataset_n):
+    # ground truth values are from metafor package in R
+    results = SampleSizeBasedLikelihoodEstimator(method='ML').fit(dataset_n)
+    beta = results['beta']['est']
+    sigma2 = results['sigma2']['est']
+    tau2 = results['tau2']['est']
+    assert np.allclose(beta, [-2.0951], atol=1e-4)
+    assert np.allclose(sigma2, 12.777, atol=1e-4)
+    assert np.allclose(tau2, 2.8268, atol=1e-4)
+
+
+def test_sample_size_based_restricted_maximum_likelihood_estimator(dataset_n):
+    # ground truth values are from metafor package in R
+    results = SampleSizeBasedLikelihoodEstimator(method='REML').fit(dataset_n)
+    beta = results['beta']['est']
+    sigma2 = results['sigma2']['est']
+    tau2 = results['tau2']['est']
+    assert np.allclose(beta, [-2.1071], atol=1e-4)
+    assert np.allclose(sigma2, 13.048, atol=1e-4)
+    assert np.allclose(tau2, 3.2177, atol=1e-4)
+
+
 def test_stan_estimator(dataset):
    # no ground truth here, so we use sanity checks and rough bounds
    results = StanMetaRegression(iter=2500).fit(dataset)
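
[Reviewer note, not part of the patch] A usage sketch of the new sample size-based path under this diff. The example data are arbitrary, and the sketch assumes the y/v/n/X aliases on Dataset shown above; import paths follow the modules touched by the patch.

import numpy as np
from pymare.core import Dataset, meta_regression
from pymare.estimators import SampleSizeBasedLikelihoodEstimator

y = np.array([[-3., -0.5, 0., -5.01, 0.35]]).T   # study-level estimates
n = np.array([[16., 16., 20., 32., 14.]]).T      # study-level sample sizes

# No variances available, so with method='ML' the top-level helper should
# dispatch to SampleSizeBasedLikelihoodEstimator.
results = meta_regression(y, sample_sizes=n, method='ML')

# Equivalently, build the Dataset and fit the estimator directly.
dataset = Dataset(y, sample_sizes=n)
est = SampleSizeBasedLikelihoodEstimator(method='ML')
assert est.accepts_dataset(dataset)   # y, n, and X (intercept) all present
results = est.fit(dataset)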