Google-Health
diff --git a/‎nonlinear-covariate-gwas/DeepNull_e2e.ipynb‎
Lines changed: 72 additions & 30 deletions b/‎nonlinear-covariate-gwas/DeepNull_e2e.ipynb‎
Lines changed: 72 additions & 30 deletions
diff --git a/‎nonlinear-covariate-gwas/README.md‎
Lines changed: 66 additions & 0 deletions b/‎nonlinear-covariate-gwas/README.md‎
Lines changed: 66 additions & 0 deletions
diff --git a/‎nonlinear-covariate-gwas/__init__.py‎
Lines changed: 1 addition & 1 deletion b/‎nonlinear-covariate-gwas/__init__.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎nonlinear-covariate-gwas/config.py‎
Lines changed: 115 additions & 0 deletions b/‎nonlinear-covariate-gwas/config.py‎
Lines changed: 115 additions & 0 deletions
@@ -5,8 +5,7 @@
     "colab": {
       "name": "DeepNull_e2e.ipynb",
       "provenance": [],
-      "collapsed_sections": [],
-      "toc_visible": true
+      "collapsed_sections": []
     },
     "kernelspec": {
       "display_name": "Python 3",
@@ -125,9 +124,11 @@
         "import numpy as np\n",
         "import pandas as pd\n",
         "import seaborn as sns\n",
+        "from sklearn import metrics as skmetrics\n",
         "import tensorflow as tf\n",
         "from typing import Dict, List\n",
         "\n",
+        "from deepnull import config\n",
         "from deepnull import data\n",
         "from deepnull import metrics as metrics_lib\n",
         "from deepnull import model as model_lib\n",
@@ -214,17 +215,16 @@
       },
       "source": [
         "# These are the parameters used in Hormozdiari et al 2021. The definition of\n",
-        "# each parameter is given in the ModelParameters class in deepnull/model.py.\n",
-        "# Edit directly to change.\n",
-        "model_params = model_lib.ModelParameters(\n",
-        "    mlp_units=[64, 64, 32, 16],\n",
-        "    mlp_activation='relu',\n",
-        "    learning_rate_batch_1024=1e-4,\n",
-        "    beta_1=0.9,\n",
-        "    beta_2=0.99,\n",
-        "    num_epochs=1000,\n",
-        "    batch_size=1024\n",
-        ")"
+        "# each parameter is given in `get_config` in deepnull/config.py. Note that\n",
+        "# XGBoost models are also available by specifying config.XGBOOST.\n",
+        "full_config = config.get_config(config.DEEPNULL)\n",
+        "\n",
+        "# These parameters can be edited directly like in the following statement. Here\n",
+        "# we train for many fewer epochs than a typical run so that the colab finishes\n",
+        "# quickly. Note that this will likely cause the following cell to warn about\n",
+        "# poor performance across data folds, since models trained for so few epochs\n",
+        "# do not converge.\n",
+        "full_config.training_config.num_epochs = 2"
       ],
       "execution_count": null,
       "outputs": []
@@ -263,14 +263,14 @@
         "    target=target_phenotype,\n",
         "    target_is_binary=target_is_binary,\n",
         "    covariates=covariates,\n",
+        "    full_config=full_config,\n",
         "    prediction_column=output_column_name,\n",
         "    num_folds=num_folds,\n",
-        "    model_params=model_params,\n",
         "    seed=random_seed,\n",
         "    # Where temporary outputs will be written.\n",
         "    logdir='/content/deepnull',\n",
         "    verbosity=1)\n",
-        "output_df, histories, validation_performance, test_perf_df = outputs\n",
+        "output_df, validation_performance, test_perf_df = outputs\n",
         "\n",
         "if not metrics_lib.acceptable_model_performance(validation_performance):\n",
         "  print('\\n\\n##### Warning!! #####')\n",
@@ -315,23 +315,58 @@
         "id": "ke0YDoKy7QE3"
       },
       "source": [
-        "def plot_model_performance(validation_summary_stats: List[Dict[str, float]],\n",
-        "                           test_performance_df: pd.DataFrame,\n",
-        "                           x: str,\n",
-        "                           y: str):\n",
+        "def plot_binary_model_performance(\n",
+        "    validation_summary_stats: List[Dict[str, float]],\n",
+        "    test_performance_df: pd.DataFrame,\n",
+        "    label_col: str,\n",
+        "    prediction_col: str):\n",
+        "  \"\"\"Plots performance for binary traits.\"\"\"\n",
+        "  num_folds = len(validation_summary_stats)\n",
+        "  fig, axs = plt.subplots(1, num_folds, figsize=(num_folds * 4, 5),\n",
+        "                          sharex=True, sharey=True)\n",
+        "  fold_column = f'{label_col}_deepnull_eval_fold'\n",
+        "  for fold, val_performance in enumerate(validation_summary_stats):\n",
+        "    fold_mask = test_performance_df[fold_column] == fold\n",
+        "    test_fold_df = test_performance_df[fold_mask]\n",
+        "    ax = axs[fold]\n",
+        "    sns.regplot(data=test_fold_df, x=prediction_col, y=label_col, ax=ax,\n",
+        "                logistic=True, scatter_kws={'alpha': 0.5})\n",
+        "    # DeepNull and XGBoost name their equivalent metrics slightly differently.\n",
+        "    val_auroc = val_performance.get('auroc') or val_performance.get('auc')\n",
+        "    val_auprc = val_performance.get('auprc') or val_performance.get('aucpr')\n",
+        "    test_auroc = skmetrics.roc_auc_score(test_fold_df[label_col],\n",
+        "                                         test_fold_df[prediction_col])\n",
+        "    test_auprc = skmetrics.average_precision_score(test_fold_df[label_col],\n",
+        "                                                   test_fold_df[prediction_col])\n",
+        "    ax.set_title(f'Fold {fold}\\n'\n",
+        "                 f'Validation AUROC: {val_auroc:.2f}\\n'\n",
+        "                 f'Validation AUPRC: {val_auprc:.2f}\\n'\n",
+        "                 f'Test AUROC: {test_auroc:.2f}\\n'\n",
+        "                 f'Test AUPRC: {test_auprc:.2f}')\n",
+        "  plt.tight_layout()\n",
+        "\n",
+        "\n",
+        "def plot_quantitative_model_performance(\n",
+        "    validation_summary_stats: List[Dict[str, float]],\n",
+        "    test_performance_df: pd.DataFrame,\n",
+        "    label_col: str,\n",
+        "    prediction_col: str):\n",
+        "  \"\"\"Plots performance for quantitative traits.\"\"\"\n",
         "  num_folds = len(validation_summary_stats)\n",
         "  fig, axs = plt.subplots(1, num_folds, figsize=(num_folds * 4, 5),\n",
         "                          sharex=True, sharey=True)\n",
-        "  fold_column = f'{x}_deepnull_eval_fold'\n",
+        "  fold_column = f'{label_col}_deepnull_eval_fold'\n",
         "  for fold, val_performance in enumerate(validation_summary_stats):\n",
         "    fold_mask = test_performance_df[fold_column] == fold\n",
         "    test_fold_df = test_performance_df[fold_mask]\n",
         "    ax = axs[fold]\n",
-        "    sns.regplot(data=test_fold_df, x=x, y=y, ax=ax, scatter_kws={'alpha': 0.5})\n",
-        "    val_mse = val_performance['mse']\n",
-        "    val_corr = val_performance['tf_pearson']\n",
-        "    test_mse = np.square(test_fold_df[x] - test_fold_df[y]).mean()\n",
-        "    test_corr = np.corrcoef(test_fold_df[x], test_fold_df[y])[0, 1]\n",
+        "    sns.regplot(data=test_fold_df, x=prediction_col, y=label_col, ax=ax,\n",
+        "                scatter_kws={'alpha': 0.5})\n",
+        "    # DeepNull and XGBoost name their equivalent metrics slightly differently.\n",
+        "    val_mse = val_performance.get('mse') or val_performance.get('rmse')**2\n",
+        "    val_corr = val_performance.get('tf_pearson') or val_performance.get('pearson')\n",
+        "    test_mse = np.square(test_fold_df[label_col] - test_fold_df[prediction_col]).mean()\n",
+        "    test_corr = np.corrcoef(test_fold_df[label_col], test_fold_df[prediction_col])[0, 1]\n",
         "    ax.set_title(f'Fold {fold}\\n'\n",
         "                 f'Validation MSE: {val_mse:.2f}\\n'\n",
         "                 f'Validation Pearson R: {val_corr:.2f}\\n'\n",
@@ -348,11 +383,18 @@
         "id": "3w2h1z5E-snM"
       },
       "source": [
-        "if not target_is_binary:\n",
-        "  plot_model_performance(validation_summary_stats=validation_performance,\n",
-        "                         test_performance_df=test_perf_df,\n",
-        "                         x=target_phenotype,\n",
-        "                         y=output_column_name)"
+        "if target_is_binary:\n",
+        "  plot_binary_model_performance(\n",
+        "      validation_summary_stats=validation_performance,\n",
+        "      test_performance_df=test_perf_df,\n",
+        "      label_col=target_phenotype,\n",
+        "      prediction_col=output_column_name)\n",
+        "else:\n",
+        "  plot_quantitative_model_performance(\n",
+        "      validation_summary_stats=validation_performance,\n",
+        "      test_performance_df=test_perf_df,\n",
+        "      label_col=target_phenotype,\n",
+        "      prediction_col=output_column_name)"
       ],
       "execution_count": null,
       "outputs": []
 
@@ -52,6 +52,72 @@ To see all available flags, run
 python -m deepnull.main --help 2> /dev/null
 ```
 
+Of particular note is the `--model_config` flag. DeepNull uses the
+[ml_collections](https://github.com/google/ml_collections) library to specify
+all parameters related to the model and training regimen. The supported
+configuration code is located in [`config.py`](config.py), and parameters can
+be modified as described in detail in the
+[`ml_collections README`](https://github.com/google/ml_collections#parameterising-the-get_config-function).
+As a brief example, to use the DeepNull architecture with the `elu` activation
+and train with batch size 4096, the above example command would be modified as
+follows:
+
+```bash
+python -m deepnull.main \
+  --input_tsv=/input/ORIGINAL_PHENOCOVAR_TSV \
+  --output_tsv=/output/PHENOCOVAR_WITH_DEEPNULL_PREDICTION_TSV \
+  --target=pheno \
+  --covariates="age,sex,genotyping_array" \
+  --model_config=/path/to/config.py:deepnull \
+  --model_config.model_config.mlp_activation=elu \
+  --model_config.training_config.batch_size=4096
+```
+
+where `/path/to/config.py` is the path to [`config.py`](config.py) on your
+machine.
+
+## Incorporating DeepNull into a GWAS analysis
+
+The above section, "How to run DeepNull", shows that the DeepNull software adds
+a single column to the phenotype+covariate file of interest; this column holds a
+nonlinear prediction of the target phenotype. To incorporate this prediction
+into a GWAS analysis, the new column should be **added** as an additional
+covariate alongside the original covariates. A concrete example with `BOLT-LMM`,
+using the same file, phenotype `pheno`, and covariates `age`, `sex`,
+`genotyping_array` as above, is shown below:
+
+### Original example GWAS command
+```bash
+# N.B. Data loading flags are omitted for brevity.
+
+bolt \
+  --phenoFile /input/ORIGINAL_PHENOCOVAR_TSV \
+  --covarFile /input/ORIGINAL_PHENOCOVAR_TSV \
+  --qCovarCol age \
+  --qCovarCol sex \
+  --qCovarCol genotyping_array \
+  --phenoCol pheno
+```
+
+After running DeepNull on `/input/ORIGINAL_PHENOCOVAR_TSV` to create the new TSV
+`/output/PHENOCOVAR_WITH_DEEPNULL_PREDICTION_TSV`, which includes the column
+`pheno_deepnull`, the updated command is given below:
+
+### Updated GWAS command to incorporate DeepNull
+```bash
+# N.B. Data loading flags are omitted for brevity.
+# Note the addition of the single `--qCovarCol pheno_deepnull` line.
+
+bolt \
+  --phenoFile /output/PHENOCOVAR_WITH_DEEPNULL_PREDICTION_TSV \
+  --covarFile /output/PHENOCOVAR_WITH_DEEPNULL_PREDICTION_TSV \
+  --qCovarCol age \
+  --qCovarCol sex \
+  --qCovarCol genotyping_array \
+  --qCovarCol pheno_deepnull \
+  --phenoCol pheno
+```
+
 ## Data
 
 Datasets used to reproduce the results from the above publication are available
 
@@ -26,4 +26,4 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 """DeepNull."""
 
-__version__ = '0.1.3'
+__version__ = '0.2.0'
@@ -0,0 +1,115 @@
+# Copyright 2021 Google LLC.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+#    list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+#    this list of conditions and the following disclaimer in the documentation
+#    and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its contributors
+#    may be used to endorse or promote products derived from this software
+#    without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+"""Configuration for all model types.
+
+This configuration file specifies all supported model types for training
+DeepNull. The configuration is parsed by model.py for the proper instantiation
+of the selected model.
+
+See https://github.com/google/ml_collections for details on ConfigDict.
+"""
+import ml_collections
+
+# Valid model types.
+# The model used for the main figures in the paper.
+DEEPNULL = 'deepnull'
+# XGBoost-based models.
+XGBOOST = 'xgboost'
+
+
+def get_config(config_name: str) -> ml_collections.ConfigDict:
+  """Returns the config specified by `config_name`.
+
+  The table of supported configs is rebuilt on every call, so the returned
+  ConfigDict is a fresh object that is not shared with earlier or later calls.
+
+  Args:
+    config_name: Name of the model type to configure. Must be one of the keys
+      of the supported-model table below (`DEEPNULL` or `XGBOOST`).
+
+  Returns:
+    A ConfigDict with a `model_type` field plus nested `model_config` and
+    `training_config` ConfigDicts. The `DEEPNULL` config additionally carries
+    an `optimizer_config`.
+
+  Raises:
+    ValueError: If `config_name` is not a supported model type.
+  """
+  supported_models = {
+      DEEPNULL:
+          ml_collections.ConfigDict({
+              'model_type':
+                  DEEPNULL,
+              'model_config':
+                  ml_collections.ConfigDict({
+                      # The MLP units for the nonlinear path of DeepNull.
+                      'mlp_units': (64, 64, 32, 16),
+                      # The activation function to use. See
+                      # https://keras.io/api/layers/activations.
+                      'mlp_activation': 'relu',
+                  }),
+              'optimizer_config':
+                  ml_collections.ConfigDict({
+                      # Learning rate for a batch size of 1024. The actual
+                      # learning rate used is scaled linearly as
+                      # `learning_rate * batch_size / 1024`.
+                      'learning_rate_batch_1024': 1e-4,
+                      # Betas for the Adam optimizer.
+                      'beta_1': 0.9,
+                      'beta_2': 0.99,
+                      # The optimization metric to use to select the best model
+                      # checkpoint. This must be a metric generated during
+                      # training (which depends on whether the target is a
+                      # binary or continuous variable). If unspecified, the
+                      # default metric for the associated target type is used.
+                      'optimization_metric': '',
+                  }),
+              'training_config':
+                  ml_collections.ConfigDict({
+                      # Number of full passes through the training data.
+                      'num_epochs': 1000,
+                      # Number of training examples per batch.
+                      'batch_size': 1024,
+                  }),
+          }),
+      XGBOOST:
+          ml_collections.ConfigDict({
+              'model_type':
+                  XGBOOST,
+              'model_config':
+                  ml_collections.ConfigDict({
+                      # See
+                      # https://xgboost.readthedocs.io/en/latest/parameter.html
+                      # for full details on all parameters.
+                      # The target objective. If unspecified, will be the
+                      # default objective for the type of model prediction (i.e.
+                      # regression vs classification).
+                      'objective': '',
+                      'max_depth': 3,
+                      'eta': 0.32,
+                      'alpha': 0.658,
+                      'lambda': 2.0,
+                      # If unspecified, will be the default metric for the type
+                      # of model prediction.
+                      'eval_metric': '',
+                  }),
+              'training_config':
+                  ml_collections.ConfigDict({
+                      'num_boost_round': 25,
+                  }),
+          }),
+  }
+
+  # Reject unknown names explicitly so the error lists the valid choices.
+  if config_name not in supported_models:
+    raise ValueError(f'Config "{config_name}" is not a supported model: '
+                     f'{sorted(supported_models)}')
+
+  return supported_models[config_name]