
Commit 4fd19c9

Merge pull request #50 from neurostuff/better-dataset-handling
[REF] Improved Dataset fitting
2 parents c7e4fcf + 2f190dd commit 4fd19c9

10 files changed: +133 additions, -79 deletions

README.md

Lines changed: 17 additions & 2 deletions
@@ -60,10 +60,25 @@ from pymare.estimators import VarianceBasedLikelihoodEstimator
 dataset = Dataset(y, v, X)
 # Estimator class for likelihood-based methods when variances are known
 estimator = VarianceBasedLikelihoodEstimator(method='REML')
-# All estimators accept a `Dataset` instance as the first argument to `.fit()`
-estimator.fit(dataset)
+# All estimators expose a fit_dataset() method that takes a `Dataset`
+# instance as the first (and usually only) argument.
+estimator.fit_dataset(dataset)
 # Post-fitting we can obtain a MetaRegressionResults instance via .summary()
 results = estimator.summary()
 # Print summary of results as a pandas DataFrame
 print(results.to_df())
 ```
+
+And if we want to be even more explicit, we can avoid the `Dataset` abstraction
+entirely (though we'll lose some convenient validation checks):
+
+```python
+estimator = VarianceBasedLikelihoodEstimator(method='REML')
+
+# X must be 2-d; this is one of the things the Dataset implicitly handles.
+X = X[:, None]
+
+estimator.fit(y, v, X)
+
+results = estimator.summary()
+```
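Since every estimator's `fit()`/`fit_dataset()` now returns `self` (see the `pymare/estimators/estimators.py` changes below), the two steps above can also be chained. A minimal sketch, reusing the `y`, `v`, and `X` arrays from the snippet and assuming the top-level `Dataset` import:

```python
from pymare import Dataset
from pymare.estimators import VarianceBasedLikelihoodEstimator

dataset = Dataset(y, v, X)
results = (VarianceBasedLikelihoodEstimator(method='REML')
           .fit_dataset(dataset)
           .summary())
print(results.to_df())
```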

examples/02_meta-analysis/plot_run_meta-analysis.py

Lines changed: 1 addition & 1 deletion
@@ -29,6 +29,6 @@
 # Datasets can also be created from pandas DataFrames
 # ---------------------------------------------------
 dataset = core.Dataset(v=v, X=X, y=y, n=n)
-est = estimators.WeightedLeastSquares().fit(dataset)
+est = estimators.WeightedLeastSquares().fit_dataset(dataset)
 results = est.summary()
 print(results.to_df())

pymare/core.py

Lines changed: 1 addition & 1 deletion
@@ -144,5 +144,5 @@ def meta_regression(y=None, v=None, X=None, n=None, data=None, X_names=None,
 
     # Get estimates
     est = est_cls(**kwargs)
-    est.fit(data)
+    est.fit_dataset(data)
     return est.summary()
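For reference, `meta_regression()` is the one-shot convenience wrapper around this code path. A sketch, reusing the arrays from the README example and assuming the top-level import and default settings:

```python
from pymare import meta_regression

# Builds a Dataset internally, fits the estimator via fit_dataset(),
# and returns the summary in one call.
results = meta_regression(y=y, v=v, X=X)
print(results.to_df())
```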

pymare/estimators/combination.py

Lines changed: 18 additions & 6 deletions
@@ -29,17 +29,18 @@ def p_value(self, z, *args, **kwargs):
     def _z_to_p(self, z):
         return ss.norm.sf(z)
 
-    def _fit(self, y, *args, **kwargs):
+    def fit(self, z, *args, **kwargs):
         if self.mode == 'concordant':
             ose = self.__class__(mode='directed')
-            p1 = ose.p_value(y, *args, **kwargs)
-            p2 = ose.p_value(-y, *args, **kwargs)
+            p1 = ose.p_value(z, *args, **kwargs)
+            p2 = ose.p_value(-z, *args, **kwargs)
             p = np.minimum(1, 2 * np.minimum(p1, p2))
         else:
             if self.mode == 'undirected':
-                y = np.abs(y)
-            p = self.p_value(y, *args, **kwargs)
-        return {'p': p}
+                z = np.abs(z)
+            p = self.p_value(z, *args, **kwargs)
+        self.params_ = {'p': p}
+        return self
 
     def summary(self):
         if not hasattr(self, 'params_'):
@@ -85,6 +86,13 @@ class StoufferCombinationTest(CombinationTest):
     (3) This estimator does not support meta-regression; any moderators
         passed in to fit() as the X array will be ignored.
     """
+
+    # Maps Dataset attributes onto fit() args; see BaseEstimator for details.
+    _dataset_attr_map = {'z': 'y', 'w': 'v'}
+
+    def fit(self, z, w=None):
+        return super().fit(z, w=w)
+
     def p_value(self, z, w=None):
         if w is None:
             w = np.ones_like(z)
@@ -128,6 +136,10 @@ class FisherCombinationTest(CombinationTest):
     (3) This estimator does not support meta-regression; any moderators
         passed in to fit() as the X array will be ignored.
     """
+
+    # Maps Dataset attributes onto fit() args; see BaseEstimator for details.
+    _dataset_attr_map = {'z': 'y'}
+
     def p_value(self, z):
         p = self._z_to_p(z)
         chi2 = -2 * np.log(p).sum(0)
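A short sketch of what the new `_dataset_attr_map` hooks buy us: a generic `Dataset` stores z-scores under its `y` attribute (and optional weights under `v`), and `fit_dataset()` remaps them onto `fit()`'s `z` and `w` arguments. The values below are fabricated, and the import paths mirror the test suite:

```python
import numpy as np
from pymare import Dataset
from pymare.estimators import StoufferCombinationTest

z = np.array([[2.1], [0.7], [-0.2], [4.1], [3.8]])  # fabricated per-study z-scores

dset = Dataset(y=z)  # the z-scores live in the Dataset's generic `y` slot
est = StoufferCombinationTest(mode='directed').fit_dataset(dset)
print(est.summary().p)  # combined p-value
```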

pymare/estimators/estimators.py

Lines changed: 65 additions & 38 deletions
@@ -16,7 +16,7 @@
 
 @wrapt.decorator
 def _loopable(wrapped, instance, args, kwargs):
-    # Decorator for _fit method of Estimator classes to handle naive looping
+    # Decorator for fit() method of Estimator classes to handle naive looping
     # over the 2nd dimension of y/v/n inputs, and reconstruction of outputs.
     n_iter = kwargs['y'].shape[1]
     if n_iter > 10:
@@ -26,6 +26,7 @@ def _loopable(wrapped, instance, args, kwargs):
            "datasets. Consider using the DL, HE, or WLS estimators, "
            "which handle parallel datasets more efficiently."
            .format(n_iter))
+
     param_dicts = []
     for i in range(n_iter):
         iter_kwargs = {'X': kwargs['X']}
@@ -35,41 +36,58 @@ def _loopable(wrapped, instance, args, kwargs):
         if 'n' in kwargs:
             n = kwargs['n'][:, i, None] if kwargs['n'].shape[1] > 1 else kwargs['n']
             iter_kwargs['n'] = n
-        param_dicts.append(wrapped(**iter_kwargs))
+        wrapped(**iter_kwargs)
+        param_dicts.append(instance.params_.copy())
+
     params = {}
     for k in param_dicts[0]:
         concat = np.stack([pd[k].squeeze() for pd in param_dicts], axis=-1)
         params[k] = np.atleast_2d(concat)
-    return params
+
+    instance.params_ = params
+    return instance
 
 
 class BaseEstimator(metaclass=ABCMeta):
 
+    # A class-level mapping from Dataset attributes to fit() arguments. Used by
+    # fit_dataset() for estimators that take non-standard arguments (e.g., 'z'
+    # instead of 'y'). Keys are the target arg names in the estimator class's
+    # fit() method (e.g., 'z') and values are the default Dataset attribute
+    # names they map onto (e.g., 'y').
+    _dataset_attr_map = {}
+
     @abstractmethod
-    def _fit(self):
-        # Subclasses must implement _fit() method that directly takes arrays.
-        # The following named arguments are allowed, and will be automatically
-        # extracted from the Dataset instance:
-        # * y (estimates)
-        # * v (variances)
-        # * n (sample_sizes)
-        # * X (predictors)
+    def fit(self, *args, **kwargs):
         pass
 
-    def fit(self, dataset=None, **kwargs):
+    def fit_dataset(self, dataset, *args, **kwargs):
+        """ Applies the current estimator to the passed Dataset container.
 
-        if dataset is not None:
-            kwargs = {}
-            spec = getfullargspec(self._fit)
-            n_kw = len(spec.defaults) if spec.defaults else 0
-            n_args = len(spec.args) - n_kw - 1
-            for i, name in enumerate(spec.args[1:]):
-                if i >= n_args:
-                    kwargs[name] = getattr(dataset, name, spec.defaults[i - n_args])
-                else:
-                    kwargs[name] = getattr(dataset, name)
+        A convenience interface that wraps fit() and automatically aligns the
+        variables held in a Dataset with the required arguments.
 
-        self.params_ = self._fit(**kwargs)
+        Args:
+            dataset (Dataset): A PyMARE Dataset instance holding the data.
+            args, kwargs: optional positional and keyword arguments to pass
+                onto the fit() method.
+        """
+        all_kwargs = {}
+        spec = getfullargspec(self.fit)
+        n_kw = len(spec.defaults) if spec.defaults else 0
+        n_args = len(spec.args) - n_kw - 1
+
+        for i, name in enumerate(spec.args[1:]):
+            # Check for remapped name
+            attr_name = self._dataset_attr_map.get(name, name)
+            if i >= n_args:
+                all_kwargs[name] = getattr(dataset, attr_name,
+                                           spec.defaults[i - n_args])
+            else:
+                all_kwargs[name] = getattr(dataset, attr_name)
+
+        all_kwargs.update(kwargs)
+        self.fit(*args, **all_kwargs)
         self.dataset_ = dataset
 
         return self
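To make the introspection concrete, here is a standalone toy sketch (not part of the commit) of the information `fit_dataset()` works from. The class and values are invented; only `getfullargspec` and the `_dataset_attr_map` convention come from the code above:

```python
from inspect import getfullargspec


class ToyEstimator:
    # fit() wants `z`, which a Dataset stores under `y` (the same convention
    # as StoufferCombinationTest's {'z': 'y'} mapping).
    _dataset_attr_map = {'z': 'y'}

    def fit(self, z, w=None):
        self.params_ = {'z': z, 'w': w}
        return self


spec = getfullargspec(ToyEstimator.fit)
print(spec.args)      # ['self', 'z', 'w'] -> names fit_dataset() iterates over
print(spec.defaults)  # (None,) -> `w` is optional; a missing Dataset
                      #           attribute falls back to this default
```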
@@ -86,7 +104,7 @@ def get_v(self, dataset):
         Notes:
             This is equivalent to directly accessing `dataset.v` when variances
             are present, but affords a way of estimating v from sample size (n)
-            for any estimator that implicitly estimate a sigma^2 parameter.
+            for any estimator that implicitly estimates a sigma^2 parameter.
         """
         if dataset.v is not None:
             return dataset.v
@@ -139,12 +157,13 @@ class WeightedLeastSquares(BaseEstimator):
     def __init__(self, tau2=0.):
         self.tau2 = tau2
 
-    def _fit(self, y, X, v=None):
+    def fit(self, y, X, v=None):
         if v is None:
             v = np.ones_like(y)
         beta, inv_cov = weighted_least_squares(y, v, X, self.tau2,
                                                return_cov=True)
-        return {'fe_params': beta, 'tau2': self.tau2, 'inv_cov': inv_cov}
+        self.params_ = {'fe_params': beta, 'tau2': self.tau2, 'inv_cov': inv_cov}
+        return self
 
 
 class DerSimonianLaird(BaseEstimator):
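The same pattern from the array-based side: `fit()` now stores `params_` on the instance and returns `self`, so it chains directly into `summary()`. A sketch with fabricated inputs (note the `fit(y, X, v=None)` argument order, which differs from the likelihood estimators):

```python
import numpy as np
from pymare.estimators import WeightedLeastSquares

y = np.array([[-1.0], [0.5], [0.5], [0.5], [1.0], [1.0]])  # fabricated estimates
v = np.array([[1.0], [1.5], [2.4], [0.5], [1.0], [4.0]])   # fabricated variances
X = np.ones((6, 1))                                        # intercept-only design

results = WeightedLeastSquares().fit(y, X, v).summary()
print(results.to_df())
```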
@@ -167,7 +186,7 @@
         identical for all iterates.
     """
 
-    def _fit(self, y, v, X):
+    def fit(self, y, v, X):
         k, p = X.shape
 
         # Estimate initial betas with WLS, assuming tau^2=0
@@ -189,7 +208,8 @@ def _fit(self, y, v, X):
         # Re-estimate beta with tau^2 estimate
         beta_dl, inv_cov = weighted_least_squares(y, v, X, tau2=tau_dl,
                                                   return_cov=True)
-        return {'fe_params': beta_dl, 'tau2': tau_dl, 'inv_cov': inv_cov}
+        self.params_ = {'fe_params': beta_dl, 'tau2': tau_dl, 'inv_cov': inv_cov}
+        return self
 
 
 class Hedges(BaseEstimator):
@@ -208,7 +228,7 @@
         identical for all iterates.
     """
 
-    def _fit(self, y, v, X):
+    def fit(self, y, v, X):
         k, p = X.shape[:2]
         _unit_v = np.ones_like(y)
         beta, inv_cov = weighted_least_squares(y, _unit_v, X, return_cov=True)
@@ -217,7 +237,8 @@
         tau_ho = np.maximum(0, tau_ho)
         # Estimate beta with tau^2 estimate
         beta_ho = weighted_least_squares(y, v, X, tau2=tau_ho)
-        return {'fe_params': beta_ho, 'tau2': tau_ho, 'inv_cov': inv_cov}
+        self.params_ = {'fe_params': beta_ho, 'tau2': tau_ho, 'inv_cov': inv_cov}
+        return self
 
 
 class VarianceBasedLikelihoodEstimator(BaseEstimator):
@@ -255,9 +276,9 @@ def __init__(self, method='ml', **kwargs):
         self.kwargs = kwargs
 
     @_loopable
-    def _fit(self, y, v, X):
+    def fit(self, y, v, X):
         # use D-L estimate for initial values
-        est_DL = DerSimonianLaird()._fit(y, v, X)
+        est_DL = DerSimonianLaird().fit(y, v, X).params_
         beta = est_DL['fe_params']
         tau2 = est_DL['tau2']
 
@@ -273,7 +294,8 @@ def _fit(self, y, v, X):
         beta, tau = res.x[:-1], float(res.x[-1])
         tau = np.max([tau, 0])
         _, inv_cov = weighted_least_squares(y, v, X, tau, True)
-        return {'fe_params': beta[:, None], 'tau2': tau, 'inv_cov': inv_cov}
+        self.params_ = {'fe_params': beta[:, None], 'tau2': tau, 'inv_cov': inv_cov}
+        return self
 
     def _ml_nll(self, theta, y, v, X):
         """ ML negative log-likelihood for meta-regression model. """
@@ -329,7 +351,7 @@ def __init__(self, method='ml', **kwargs):
         self.kwargs = kwargs
 
     @_loopable
-    def _fit(self, y, n, X):
+    def fit(self, y, n, X):
         if n.std() < np.sqrt(np.finfo(float).eps):
             raise ValueError("Sample size-based likelihood estimator cannot "
                              "work with all-equal sample sizes.")
@@ -353,8 +375,13 @@ def _fit(self, y, n, X):
         beta, sigma, tau = res.x[:-2], float(res.x[-2]), float(res.x[-1])
         tau = np.max([tau, 0])
         _, inv_cov = weighted_least_squares(y, sigma / n, X, tau, True)
-        return {'fe_params': beta[:, None], 'sigma2': np.array(sigma), 'tau2': tau,
-                'inv_cov': inv_cov}
+        self.params_ = {
+            'fe_params': beta[:, None],
+            'sigma2': np.array(sigma),
+            'tau2': tau,
+            'inv_cov': inv_cov
+        }
+        return self
 
     def _ml_nll(self, theta, y, n, X):
         """ ML negative log-likelihood for meta-regression model. """
@@ -431,7 +458,7 @@ def compile(self):
         from pystan import StanModel
         self.model = StanModel(model_code=spec)
 
-    def _fit(self, y, v, X, groups=None):
+    def fit(self, y, v, X, groups=None):
         """Run the Stan sampler and return results.
 
         Args:
@@ -479,7 +506,7 @@ def _fit(self, y, v, X, groups=None):
         }
 
         self.result_ = self.model.sampling(data=data, **self.sampling_kwargs)
-        return self.result_
+        return self
 
     def summary(self, ci=95):
         if self.result_ is None:
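The `_loopable` changes above preserve the parallel-datasets behavior: when `y` (and `v` or `n`) carry a second dimension, the decorated `fit()` loops over columns and stacks each iteration's `params_`. A sketch with fabricated parallel data:

```python
import numpy as np
from pymare.estimators import VarianceBasedLikelihoodEstimator

rng = np.random.default_rng(0)
k, n_datasets = 8, 3
y = rng.normal(size=(k, n_datasets))             # three parallel datasets
v = rng.uniform(0.5, 1.5, size=(k, n_datasets))  # matching variances
X = np.ones((k, 1))                              # shared design matrix

est = VarianceBasedLikelihoodEstimator(method='ml').fit(y, v, X)
print(est.params_['tau2'])  # one tau^2 estimate per parallel dataset
```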

pymare/results.py

Lines changed: 6 additions & 6 deletions
@@ -177,7 +177,7 @@ def permutation_test(self, n_perm=1000):
             y_perm = np.repeat(y[:, None], n_perm, axis=1)
 
             # for v, we might actually be working with n, depending on estimator
-            has_v = 'v' in getfullargspec(self.estimator._fit).args[1:]
+            has_v = 'v' in getfullargspec(self.estimator.fit).args[1:]
             v = self.dataset.v[:, i] if has_v else self.dataset.n[:, i]
 
             v_perm = np.repeat(v[:, None], n_perm, axis=1)
@@ -203,7 +203,7 @@
             # Pass parameters, remembering that v may actually be n
             kwargs = {'y': y_perm, 'X': self.dataset.X}
             kwargs['v' if has_v else 'n'] = v_perm
-            params = self.estimator._fit(**kwargs)
+            params = self.estimator.fit(**kwargs).params_
 
             fe_obs = fe_stats['est'][:, i]
             if fe_obs.ndim == 1:
@@ -304,10 +304,10 @@
                 y_perm *= signs
 
                 # Some combination tests can handle weights (passed as v)
-                kwargs = {'y': y_perm}
-                if 'v' in getfullargspec(est._fit).args:
-                    kwargs['v'] = self.dataset.v
-                params = est._fit(**kwargs)
+                kwargs = {'z': y_perm}
+                if 'w' in getfullargspec(est.fit).args:
+                    kwargs['w'] = self.dataset.v
+                params = est.fit(**kwargs).params_
 
                 p_obs = self.z[i]
                 if p_obs.ndim == 1:
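These are the internals behind the public `permutation_test()` method on the results objects; the calling convention is unchanged. A sketch, assuming a fitted estimator and `dataset` as in the README example:

```python
est = VarianceBasedLikelihoodEstimator(method='REML').fit_dataset(dataset)
results = est.summary()

# Re-fits the estimator on permuted data n_perm times to build an
# empirical null for the fixed-effect estimates.
perm_results = results.permutation_test(n_perm=1000)
```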

pymare/tests/test_combination_tests.py

Lines changed: 2 additions & 2 deletions
@@ -27,15 +27,15 @@
 
 @pytest.mark.parametrize("Cls,data,mode,expected", _params)
 def test_combination_test(Cls, data, mode, expected):
-    results = Cls(mode)._fit(data)
+    results = Cls(mode).fit(data).params_
     z = ss.norm.isf(results['p'])
     assert np.allclose(z, expected, atol=1e-5)
 
 
 @pytest.mark.parametrize("Cls,data,mode,expected", _params)
 def test_combination_test_from_dataset(Cls, data, mode, expected):
     dset = Dataset(y=data)
-    est = Cls(mode).fit(dset)
+    est = Cls(mode).fit_dataset(dset)
     results = est.summary()
     z = ss.norm.isf(results.p)
     assert np.allclose(z, expected, atol=1e-5)
