|
| 1 | +# emacs: -*- mode: python-mode; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*- |
| 2 | +# ex: set sts=4 ts=4 sw=4 et: |
| 3 | +r""" |
| 4 | +.. _meta1: |
| 5 | +
|
| 6 | +================================================ |
| 7 | + Run Estimators on a simulated dataset |
| 8 | +================================================ |
| 9 | +
|
| 10 | +PyMARE implements a range of meta-analytic estimators. |
| 11 | +In this example, we build a simulated dataset with a known ground truth and |
| 12 | +use it to compare PyMARE's estimators. |
| 13 | +
|
| 14 | +.. note:: |
| 15 | + The variance of the true effect is composed of both between-study variance |
| 16 | + (:math:`\tau^{2}`) and within-study variance (:math:`\sigma^{2}`). |
| 17 | + Within-study variance is generally taken from sampling variance values from |
| 18 | + individual studies (``v``), while between-study variance can be estimated |
| 19 | + via a number of methods. |
| 20 | +""" |
| 21 | +# sphinx_gallery_thumbnail_number = 3 |
| 22 | +import numpy as np |
| 23 | +import matplotlib.pyplot as plt |
| 24 | +from scipy import stats |
| 25 | +import seaborn as sns |
| 26 | + |
| 27 | +import pymare |
| 28 | + |
| 29 | + |
| 30 | +# A small function to make things easier later on |
| 31 | +def var_to_ci(y, v, n): |
| 32 | + """Convert sampling variance to 95% CI""" |
| 33 | + term = 1.96 * np.sqrt(v) / np.sqrt(n) |
| 34 | + return y - term, y + term |
| 35 | + |
| 36 | + |
| 37 | +############################################################################### |
| 38 | +# Here we simulate a dataset |
| 39 | +# ----------------------------------------------------------------------------- |
| 40 | +# This is a simple dataset with a one-sample design. |
| 41 | +# We are interested in estimating the true effect size from a set of one-sample |
| 42 | +# studies. |
| 43 | +N_STUDIES = 40 |
| 44 | +BETWEEN_STUDY_VAR = 400 # population variance |
| 45 | +between_study_sd = np.sqrt(BETWEEN_STUDY_VAR) |
| 46 | +TRUE_EFFECT = 20 |
| 47 | +sample_sizes = np.round(np.random.normal(loc=50, scale=20, size=N_STUDIES)).astype(int) |
| 48 | +within_study_vars = np.random.normal(loc=400, scale=100, size=N_STUDIES) |
| 49 | +study_means = np.random.normal(loc=TRUE_EFFECT, scale=between_study_sd, size=N_STUDIES) |
| 50 | + |
| 51 | +sample_sizes[sample_sizes <= 1] = 2 |
| 52 | +within_study_vars = np.abs(within_study_vars) |
| 53 | + |
| 54 | +# Convert data types and match PyMARE nomenclature |
| 55 | +y = study_means |
| 56 | +X = np.ones((N_STUDIES)) |
| 57 | +v = within_study_vars |
| 58 | +n = sample_sizes |
| 59 | +sd = np.sqrt(v * n) |
| 60 | +z = y / sd |
| 61 | +p = stats.norm.sf(abs(z)) * 2 |
| 62 | + |
| 63 | +############################################################################### |
| 64 | +# Plot variable distributions |
| 65 | +# ----------------------------------------------------------------------------- |
| 66 | +fig, axes = plt.subplots(nrows=5, figsize=(16, 10)) |
| 67 | +sns.distplot(y, ax=axes[0], bins=20) |
| 68 | +axes[0].set_title('y') |
| 69 | +sns.distplot(v, ax=axes[1], bins=20) |
| 70 | +axes[1].set_title('v') |
| 71 | +sns.distplot(n, ax=axes[2], bins=20) |
| 72 | +axes[2].set_title('n') |
| 73 | +sns.distplot(z, ax=axes[3], bins=20) |
| 74 | +axes[3].set_title('z') |
| 75 | +sns.distplot(p, ax=axes[4], bins=20) |
| 76 | +axes[4].set_title('p') |
| 77 | +for i in range(5): |
| 78 | + axes[i].set_yticks([]) |
| 79 | +fig.tight_layout() |
| 80 | + |
| 81 | +############################################################################### |
| 82 | +# Plot means and confidence intervals |
| 83 | +# ----------------------------------- |
| 84 | +# Here we can show study-wise mean effect and CIs, along with the true effect |
| 85 | +# and CI corresponding to the between-study variance. |
| 86 | +fig, ax = plt.subplots(figsize=(8, 16)) |
| 87 | +study_ticks = np.arange(N_STUDIES) |
| 88 | + |
| 89 | +# Get 95% CI for individual studies |
| 90 | +lower_bounds, upper_bounds = var_to_ci(y, v, n) |
| 91 | +ax.scatter(y, study_ticks+1) |
| 92 | +for study in study_ticks: |
| 93 | + ax.plot((lower_bounds[study], upper_bounds[study]), (study+1, study+1), color='blue') |
| 94 | +ax.axvline(0, color='gray', alpha=0.2, linestyle='--', label='Zero') |
| 95 | +ax.axvline(np.mean(y), color='orange', alpha=0.2, label='Mean of Observed Effects') |
| 96 | + |
| 97 | +# Get 95% CI for true effect |
| 98 | +lower_bound, upper_bound = var_to_ci(TRUE_EFFECT, BETWEEN_STUDY_VAR, 1) |
| 99 | +ax.scatter((TRUE_EFFECT,), (N_STUDIES+1,), color='green', label='True Effect') |
| 100 | +ax.plot((lower_bound, upper_bound), (N_STUDIES+1, N_STUDIES+1), |
| 101 | + color='green', linewidth=3, label='Between-Study 95% CI') |
| 102 | +ax.set_ylim((0, N_STUDIES+2)) |
| 103 | +ax.set_xlabel('Mean (95% CI)') |
| 104 | +ax.set_ylabel('Study') |
| 105 | +ax.legend() |
| 106 | +fig.tight_layout() |
| 107 | + |
| 108 | +############################################################################### |
| 109 | +# Create a Dataset object containing the data |
| 110 | +# -------------------------------------------- |
| 111 | +dset = pymare.Dataset(y=y, X=None, v=v, n=n, add_intercept=True) |
| 112 | + |
| 113 | +# Here is a dictionary to house results across models |
| 114 | +results = {} |
| 115 | + |
| 116 | +############################################################################### |
| 117 | +# Fit models |
| 118 | +# ----------------------------------------------------------------------------- |
| 119 | +# When you have ``z`` or ``p``: |
| 120 | +# |
| 121 | +# - :class:`pymare.estimators.Stouffers` |
| 122 | +# - :class:`pymare.estimators.Fishers` |
| 123 | +# |
| 124 | +# When you have ``y`` and ``v`` and don't want to estimate between-study variance: |
| 125 | +# |
| 126 | +# - :class:`pymare.estimators.WeightedLeastSquares` |
| 127 | +# |
| 128 | +# When you have ``y`` and ``v`` and want to estimate between-study variance: |
| 129 | +# |
| 130 | +# - :class:`pymare.estimators.DerSimonianLaird` |
| 131 | +# - :class:`pymare.estimators.Hedges` |
| 132 | +# - :class:`pymare.estimators.VarianceBasedLikelihoodEstimator` |
| 133 | +# |
| 134 | +# When you have ``y`` and ``n`` and want to estimate between-study variance: |
| 135 | +# |
| 136 | +# - :class:`pymare.estimators.SampleSizeBasedLikelihoodEstimator` |
| 137 | +# |
| 138 | +# When you have ``y`` and ``v`` and want a hierarchical model: |
| 139 | +# |
| 140 | +# - :class:`pymare.estimators.StanMetaRegression` |
| 141 | + |
| 142 | +############################################################################### |
| 143 | +# First, we have "combination models", which combine p and/or z values |
| 144 | +# ````````````````````````````````````````````````````````````````````````````` |
| 145 | +# The two combination models in PyMARE are Stouffer's and Fisher's Tests. |
| 146 | +# |
| 147 | +# Notice that these models don't use :class:`pymare.core.Dataset` objects. |
| 148 | +stouff = pymare.estimators.StoufferCombinationTest() |
| 149 | +stouff.fit(z[:, None]) |
| 150 | +print('Stouffers') |
| 151 | +print('p: {}'.format(stouff.params_["p"])) |
| 152 | +print() |
| 153 | + |
| 154 | +fisher = pymare.estimators.FisherCombinationTest() |
| 155 | +fisher.fit(z[:, None]) |
| 156 | +print('Fishers') |
| 157 | +print('p: {}'.format(fisher.params_["p"])) |
| 158 | + |
| 159 | +############################################################################### |
| 160 | +# Now we have a fixed effects model |
| 161 | +# ````````````````````````````````````````````````````````````````````````````` |
| 162 | +# This estimator does not attempt to estimate between-study variance. |
| 163 | +# Instead, it takes ``tau2`` (:math:`\tau^{2}`) as an argument. |
| 164 | +wls = pymare.estimators.WeightedLeastSquares() |
| 165 | +wls.fit_dataset(dset) |
| 166 | +wls_summary = wls.summary() |
| 167 | +results['Weighted Least Squares'] = wls_summary.to_df() |
| 168 | +print('Weighted Least Squares') |
| 169 | +print(wls_summary.to_df().T) |
| 170 | + |
| 171 | +############################################################################### |
| 172 | +# Methods that estimate between-study variance |
| 173 | +# ````````````````````````````````````````````````````````````````````````````` |
| 174 | +# The ``DerSimonianLaird``, ``Hedges``, and ``VarianceBasedLikelihoodEstimator`` |
| 175 | +# estimators all estimate between-study variance from the data, and use ``y`` |
| 176 | +# and ``v``. |
| 177 | +# |
| 178 | +# ``DerSimonianLaird`` and ``Hedges`` use relatively simple methods for |
| 179 | +# estimating between-study variance, while ``VarianceBasedLikelihoodEstimator`` |
| 180 | +# can use either maximum-likelihood (ML) or restricted maximum-likelihood (REML) |
| 181 | +# to iteratively estimate it. |
| 182 | +dsl = pymare.estimators.DerSimonianLaird() |
| 183 | +dsl.fit_dataset(dset) |
| 184 | +dsl_summary = dsl.summary() |
| 185 | +results['DerSimonian-Laird'] = dsl_summary.to_df() |
| 186 | +print('DerSimonian-Laird') |
| 187 | +print(dsl_summary.to_df().T) |
| 188 | +print() |
| 189 | + |
| 190 | +hedge = pymare.estimators.Hedges() |
| 191 | +hedge.fit_dataset(dset) |
| 192 | +hedge_summary = hedge.summary() |
| 193 | +results['Hedges'] = hedge_summary.to_df() |
| 194 | +print('Hedges') |
| 195 | +print(hedge_summary.to_df().T) |
| 196 | +print() |
| 197 | + |
| 198 | +vb_ml = pymare.estimators.VarianceBasedLikelihoodEstimator(method='ML') |
| 199 | +vb_ml.fit_dataset(dset) |
| 200 | +vb_ml_summary = vb_ml.summary() |
| 201 | +results['Variance-Based with ML'] = vb_ml_summary.to_df() |
| 202 | +print('Variance-Based with ML') |
| 203 | +print(vb_ml_summary.to_df().T) |
| 204 | +print() |
| 205 | + |
| 206 | +vb_reml = pymare.estimators.VarianceBasedLikelihoodEstimator(method='REML') |
| 207 | +vb_reml.fit_dataset(dset) |
| 208 | +vb_reml_summary = vb_reml.summary() |
| 209 | +results['Variance-Based with REML'] = vb_reml_summary.to_df() |
| 210 | +print('Variance-Based with REML') |
| 211 | +print(vb_reml_summary.to_df().T) |
| 212 | +print() |
| 213 | + |
| 214 | +# The ``SampleSizeBasedLikelihoodEstimator`` estimates between-study variance |
| 215 | +# using ``y`` and ``n``, but assumes within-study variance is homogenous |
| 216 | +# across studies. |
| 217 | +sb_ml = pymare.estimators.SampleSizeBasedLikelihoodEstimator(method='ML') |
| 218 | +sb_ml.fit_dataset(dset) |
| 219 | +sb_ml_summary = sb_ml.summary() |
| 220 | +results['Sample Size-Based with ML'] = sb_ml_summary.to_df() |
| 221 | +print('Sample Size-Based with ML') |
| 222 | +print(sb_ml_summary.to_df().T) |
| 223 | +print() |
| 224 | + |
| 225 | +sb_reml = pymare.estimators.SampleSizeBasedLikelihoodEstimator(method='REML') |
| 226 | +sb_reml.fit_dataset(dset) |
| 227 | +sb_reml_summary = sb_reml.summary() |
| 228 | +results['Sample Size-Based with REML'] = sb_reml_summary.to_df() |
| 229 | +print('Sample Size-Based with REML') |
| 230 | +print(sb_reml_summary.to_df().T) |
| 231 | + |
| 232 | +############################################################################### |
| 233 | +# What about the Stan estimator? |
| 234 | +# ````````````````````````````````````````````````````````````````````````````` |
| 235 | +# We're going to skip this one here because of how computationally intensive it |
| 236 | +# is. |
| 237 | + |
| 238 | +############################################################################### |
| 239 | +# Let's check out our results! |
| 240 | +# ````````````````````````````````````````````````````````````````````````````` |
| 241 | +fig, ax = plt.subplots(figsize=(8, 8)) |
| 242 | + |
| 243 | +for i, (estimator_name, summary_df) in enumerate(results.items()): |
| 244 | + ax.scatter((summary_df.loc[0, 'estimate'],), (i+1,), label=estimator_name) |
| 245 | + ax.plot((summary_df.loc[0, 'ci_0.025'], summary_df.loc[0, 'ci_0.975']), |
| 246 | + (i+1, i+1), |
| 247 | + linewidth=3) |
| 248 | + |
| 249 | +# Get 95% CI for true effect |
| 250 | +lower_bound, upper_bound = var_to_ci(TRUE_EFFECT, BETWEEN_STUDY_VAR, 1) |
| 251 | +ax.scatter((TRUE_EFFECT,), (i+2,), label='True Effect') |
| 252 | +ax.plot((lower_bound, upper_bound), (i+2, i+2), |
| 253 | + linewidth=3, label='Between-Study 95% CI') |
| 254 | +ax.set_ylim((0, i+3)) |
| 255 | +ax.set_yticklabels([None] + list(results.keys()) + ['True Effect']) |
| 256 | + |
| 257 | +ax.set_xlabel('Mean (95% CI)') |
| 258 | +fig.tight_layout() |
0 commit comments