RajLabMSSM · jackhump · Feb 23, 2024 · Mar 29, 2024 · Apr 3, 2024 · Apr 11, 2024
diff --git a/COLOC/Snakefile b/COLOC/Snakefile
@@ -5,24 +5,31 @@ gwas_data = config["gwas"]
 qtl_data = config["qtl"]
 outFolder = config["outFolder"]
 geneMeta = config["geneMeta"]
-
+sig_threshold = config["sig_threshold"]
 # sanity check - can the GWAS and QTL data be found in the database?
-gwas_df = pd.read_excel("/sc/arion/projects/ad-omics/data/references//GWAS/GWAS-QTL_data_dictionary.xlsx", sheet_name = 2)
-qtl_df = pd.read_excel("/sc/arion/projects/ad-omics/data/references//GWAS/GWAS-QTL_data_dictionary.xlsx", sheet_name = 1)
+gwas_df = pd.read_excel("/sc/arion/projects/ad-omics/data/references//GWAS/GWAS-QTL_data_dictionary.xlsx", sheet_name = 1)
+qtl_df = pd.read_excel("/sc/arion/projects/ad-omics/data/references//GWAS/GWAS-QTL_data_dictionary.xlsx", sheet_name = 2)
 
 gwas_check = all(i in gwas_df["dataset"].tolist() for i in gwas_data )
 qtl_check = all(i in qtl_df["dataset"].tolist() for i in qtl_data )
 
-if not all([gwas_check, qtl_check]):
-    print(" * QTL and/or GWAS cannot be found in database")
+if not all([gwas_check]):
+    print(" * GWAS cannot be found in database")
+    sys.exit()
+
+if not all([qtl_check]):
+    print(" * QTL cannot be found in database")
+    print(qtl_check)
     sys.exit()
 
+
 shell.prefix("export PS1=""; ml anaconda3; CONDA_BASE=$(conda info --base); source $CONDA_BASE/etc/profile.d/conda.sh; ml purge; conda activate snakemake;")
 rule all:
     input:
         #expand(outFolder + "{GWAS}/{QTL}/{QTL}_{GWAS}_coloc_results.snp.1kgp3_ld.tsv.gz", GWAS = gwas_data, QTL = qtl_data)
         outFolder + "all_COLOC_results_merged_H4_0_no_LD.tsv.gz",
         outFolder + "all_COLOC_results_merged_H4_0.5_with_LD.tsv.gz",
+        #outFolder + "pairwise_QTL_results.tsv.gz"
         #expand( outFolder + "{GWAS}/{QTL}_{GWAS}_COLOC.tsv", GWAS = gwas_data, QTL = qtl_data )
         #outFolder + "all_COLOC_summary_results.tsv.gz"
 
@@ -34,8 +41,8 @@ rule run_COLOC:
         script = "scripts/run_COLOC.R"
     shell:
         "mkdir -p {outFolder}{wildcards.GWAS};"
-        "ml R/4.0.3;" #ml arrow;"
-        "Rscript {params.script} -g {wildcards.GWAS} -q {wildcards.QTL} -o {outFolder}{wildcards.GWAS}/ "
+        "ml R;" #ml arrow;"
+        "Rscript {params.script} -g {wildcards.GWAS} -q {wildcards.QTL} -o {outFolder}{wildcards.GWAS}/ -s 1e-5 --verbose --lowmem"
 
 rule merge_COLOC_snp_level:
     input:
@@ -46,7 +53,7 @@ rule merge_COLOC_snp_level:
     params:
         script = "scripts/merge_COLOC_results.R"
     shell:
-        "ml R/4.0.3;"
+        "ml R;" # LDLINKR only installed on 3.6
         "Rscript {params.script} --threshold 0 --inFolder {outFolder} --geneMeta {geneMeta};" 
         "Rscript {params.script} --threshold 0.5 --ld --inFolder {outFolder} --geneMeta {geneMeta} "
 
@@ -61,3 +68,25 @@ rule create_LD_tables:
     shell:
        "conda activate echoR;"
        "Rscript {params.script} -g {wildcards.GWAS} -q {wildcards.QTL} --inFile {input} --outFolder {params.out}" 
+
+rule prepare_QTL_COLOC:
+    input:
+        outFolder + "all_COLOC_results_merged_H4_0.5_with_LD.tsv.gz"
+    output:
+        outFolder + "pairwise_QTL_COLOC_targets.tsv.gz"
+    params:
+        script = "scripts/prepare_pairwise_qtl_coloc.R"
+    shell:
+        "ml R/4.2.0;"
+        "Rscript {params.script} -i {input} -o {output} -m across -p 0.8"
+
+rule run_QTL_COLOC:
+    input:
+        outFolder + "pairwise_QTL_COLOC_targets.tsv.gz"
+    output:
+        outFolder + "pairwise_QTL_results.tsv.gz"
+    params:
+        script = "scripts/run_COLOC.R"
+    shell:
+        "ml R/4.2.0;"
+        "Rscript {params.script} -t {input} -o {output}"
diff --git a/COLOC/cluster.yaml b/COLOC/cluster.yaml
@@ -1,10 +1,13 @@
 __default__:
-  #partition: chimera
   queue: premium
   cores: 1
-  mem: 24000
+  mem: 48000
   time: '120'
   name: $(basename $(pwd)):{rule}:{wildcards}
   output: logs/{rule}:{wildcards}.stdout
   error: logs/{rule}:{wildcards}.stderr
   himem: ""
+pairwise_QTL_COLOC:
+  cores: 4
+  mem: 16000
+  time: 8640
diff --git a/COLOC/pairwise_qtl_coloc.smk b/COLOC/pairwise_qtl_coloc.smk
@@ -0,0 +1,31 @@
+import sys
+import pandas as pd
+import glob
+import re
+import os
+
+# Pairwise QTL COLOC pipeline
+# Jack Humphrey
+outFolder = config["outFolder"]
+inFolder = config["inFolder"]
+MAF_file = config["MAF_file"]
+
+target_files = [ x for x in glob.glob(inFolder + "*genewide*pairs*tsv.gz" )]
+target_ids = [os.path.basename(re.sub('.tsv.gz', '', x, count=1)) for x in target_files ]
+
+
+shell.prefix("export PS1=""; ml anaconda3; CONDA_BASE=$(conda info --base); source $CONDA_BASE/etc/profile.d/conda.sh; ml purge; conda activate snakemake;")
+rule all:
+    input:
+        expand( outFolder + "{target}_COLOC.tsv.gz", target = target_ids)
+
+rule pairwise_QTL_COLOC:
+    input:
+        inFolder + "{target}.tsv.gz"
+    output:
+        outFolder + "{target}_COLOC.tsv.gz"
+    params:
+        script = "scripts/run_COLOC.R"
+    shell:
+        "ml R;" 
+        "Rscript {params.script} --targets {input} -o {outFolder} --lowmem --threads 1 --MAF {MAF_file}"
diff --git a/COLOC/scripts/prepare_pairwise_qtl_coloc.R b/COLOC/scripts/prepare_pairwise_qtl_coloc.R
@@ -0,0 +1,73 @@
+library(optparse)
+
+option_list <- list(
+        make_option(c('-o', '--outFile'), help='the path to the output file', default = ""),
+        make_option(c('-i', '--inFile'), help='the path to the input file', default = ""),
+        make_option(c('-m', '--mode'), help='mode - across or within?', default = "across"),
+        make_option(c('-p', '--pp'), help='min PP4', default = 0.5),
+        make_option(c('--debug'), help = "load all files and then save RData without running COLOC", action = "store_true", default = FALSE)
+)
+
+option.parser <- OptionParser(option_list=option_list)
+opt <- parse_args(option.parser)
+
+inFile <- opt$inFile
+outFile <- opt$outFile
+mode <- opt$mode
+PP4 <- opt$pp
+library(tidyverse)
+
+#all_coloc <- read_tsv("../all_COLOC_results_merged_H4_0.5_with_LD.tsv.gz")
+all_coloc <- read_tsv(inFile)
+
+all_coloc$distance <- abs(all_coloc$GWAS_pos - all_coloc$QTL_pos)
+all_coloc$distance_filter <- (!is.na(all_coloc$LD) & all_coloc$LD > 0.1) | all_coloc$distance < 5e5
+
+qtl_coloc_input <- all_coloc %>%
+  filter(distance_filter == TRUE & PP.H4.abf > PP4) %>%
+  select(GWAS, QTL, locus, feature) %>%
+  distinct()
+
+message(" * ", nrow(qtl_coloc_input), " phenotypes colocalize at PP4 > ", PP4)
+stopifnot( nrow(qtl_coloc_input) > 0)
+
+
+shared_loci <- select(qtl_coloc_input, GWAS, locus, QTL) %>%
+  distinct() %>%
+  group_by(GWAS, locus) %>%
+  tally() %>%
+  filter(n > 1)
+message(" * ", nrow(shared_loci), " loci are shared between QTL types")
+stopifnot( nrow(shared_loci) > 0)
+
+qtl_coloc_input  <- filter(qtl_coloc_input, locus %in% shared_loci$locus)
+
+# get all pairwise comparisons required
+# make optional - do you want to test features within a dataset or just across?
+qtl_coloc_split <- qtl_coloc_input %>%
+  split(paste0(.$GWAS, ":", .$locus)) %>%
+  map_df( ~{
+    d <- select(.x, GWAS,locus, QTL, feature) %>% distinct() 
+    qtl_pairs <- combn(unique(paste(d$QTL, d$feature) ), 2)
+    locus_df <- select(d, GWAS, locus)
+    pair_df <- 
+      data.frame(qtl_1 = qtl_pairs[1,], qtl_2 = qtl_pairs[2,] ) %>%
+      tidyr::separate(qtl_1, into = c("qtl_1", "feature_1"), sep = " ") %>%
+      tidyr::separate(qtl_2, into = c("qtl_2", "feature_2"), sep = " ") %>%
+      mutate(GWAS = unique(d$GWAS), locus = unique(d$locus)) %>%
+      select(GWAS, locus, everything() )
+
+  }) %>%
+  distinct()
+
+if( mode == "across"){
+
+qtl_coloc_split <- qtl_coloc_split %>%
+ mutate( within = qtl_1 == qtl_2) %>% filter(within == FALSE)
+}
+
+write_tsv(qtl_coloc_split, outFile) #"test_eQTL_edQTL_pairwise_input.tsv")
+
+
+
+