From 509ef85bc0d6f67cceec1dd57d2cc554b9d8a8ad Mon Sep 17 00:00:00 2001
From: Jack Humphrey <jack.humphrey@mssm.edu>
Date: Fri, 23 Feb 2024 16:10:04 -0500
Subject: [PATCH 1/7] added QTL-QTL coloc - still fixing bugs

---
 COLOC/scripts/run_COLOC.R | 203 ++++++++++++++++++++++----------------
 1 file changed, 119 insertions(+), 84 deletions(-)

diff --git a/COLOC/scripts/run_COLOC.R b/COLOC/scripts/run_COLOC.R
index 69bc401..19b6f96 100644
--- a/COLOC/scripts/run_COLOC.R
+++ b/COLOC/scripts/run_COLOC.R
@@ -455,7 +455,44 @@ extractQTL <- function(qtl, coord, sig_level = 0.05, force_maf = FALSE){
     return(res_split)
 }
 
-       
+# for each gene extract top QTL SNP   
+getQTLinfo <- function(q, hit_info){  
+    if( all( c("beta", "varbeta") %in% names(q[[1]]) ) ){
+        qtl_info <- q %>% 
+        map_df( ~{ 
+            as.data.frame(.x, stringsAsFactors=FALSE) %>% 
+            arrange(pvalues) %>% head(1) %>% 
+            select(gene, QTL_SNP = snp, QTL_P = pvalues, QTL_Beta = beta, QTL_SE = varbeta, QTL_MAF = MAF, QTL_chr, QTL_pos) %>%
+            mutate( QTL_SE = sqrt(QTL_SE) ) # get SE back from Variance
+        })
+        # get out beta, se and P for GWAS SNP 
+        gwas_info <- q %>%
+         map_df( ~{
+            df <- as.data.frame(.x, stringsAsFactors=FALSE) %>%
+                filter(snp == hit_info$GWAS_SNP )
+            if( nrow(df) == 1){
+                df %>%
+                select(GWAS_SNP_Beta = beta,
+                       GWAS_SNP_SE = varbeta,
+                       GWAS_SNP_P = pvalues ) %>%
+                mutate(GWAS_SNP_SE = sqrt(GWAS_SNP_SE) )
+            }else{
+            tibble(GWAS_SNP_Beta = NA, GWAS_SNP_SE = NA, GWAS_SNP_P = NA)
+            }
+        }) 
+        qtl_info <- bind_cols(qtl_info, gwas_info)
+        print(head(qtl_info) )
+    }else{
+        qtl_info <- q %>% 
+        map_df( ~{ 
+            as.data.frame(.x, stringsAsFactors=FALSE) %>% 
+            arrange(pvalues) %>% head(1) %>% 
+            select(gene, QTL_SNP = snp, QTL_P = pvalues, QTL_MAF = MAF, QTL_chr, QTL_pos)
+        }) 
+
+    }
+    return(qtl_info)
+}      
 
 
 # running COLOC between a GWAS locus and all eQTLs within 1MB either side
@@ -467,7 +504,8 @@ extractQTL <- function(qtl, coord, sig_level = 0.05, force_maf = FALSE){
 # output a list of objects:
 ## COLOC results - this should include the GWAS locus, the gene of interest, the top QTL variant and the top QTL p-value
 ## object - this should be a table combining the two input datasets, inner joined on "snp", to be used for plotting.
-runCOLOC <- function(gwas, qtl, hit){
+## sig.level now filters out QTLs with P > sig.level
+runCOLOC <- function(gwas, qtl, qtl2 = NULL, hit, sig.level = NULL){
     message(" * Analysing LOCUS: ", hit$locus )
     # hit is a dataframe containing "snp", "chr", "pos", "locus"
     
@@ -479,24 +517,12 @@ runCOLOC <- function(gwas, qtl, hit){
     g <- extractGWAS(gwas, hit_range)
     # get hit info from GWAS
     hit_snp <- hit$snp
-    # this assumes that the hit SNP is within the g object
-    # if the top hit doesn't have an RSID or cannot be MAF matched then this is violated
-    # I will have to use the hit object instead and accept that not all top_loci lists include the MAF, P value etc.
-    #if( hit_snp == hit$snp ){
-    #hit_info <- as.data.frame(g, stringsAsFactors = FALSE) %>% filter(snp == hit_snp) %>% 
-    #            select(GWAS_SNP = snp, GWAS_P = pvalues, GWAS_effect_size = beta, GWAS_MAF = MAF) %>%
-    #            mutate(locus = hit$locus) %>% select(locus, everything() )
-    #}else{
-    # now just use top loci
-   
+  
     # problem when no p column in hit!
 
     hit_info <- hit %>%
             select(locus, GWAS_SNP = snp, GWAS_P = p)
-    #}
- 
     # if GWAS is hg19 then lift over to hg38
-    # use the lifted over position in hit_info
     if(gwas$build != qtl$build){
         qtl_coord <- liftOverCoord(hit_coord, from = gwas$build, to = qtl$build)
     }else{
@@ -505,57 +531,39 @@ runCOLOC <- function(gwas, qtl, hit){
     # record the position of the GWAS top SNP
     hit_info$GWAS_chr <- splitCoords(qtl_coord)$chr
     hit_info$GWAS_pos <- splitCoords(qtl_coord)$end
-
-
     qtl_range <- joinCoords(qtl_coord, flank = 1e6)
     
     message("       * extracting QTLs")
     q <- extractQTL(qtl, qtl_range)
     print(names(q[[1]]))
     if( is.null(q) ){ return(NULL) }
-    # for each gene extract top QTL SNP   
-    if( all( c("beta", "varbeta") %in% names(q[[1]]) ) ){
-        qtl_info <- q %>% 
-        map_df( ~{ 
-            as.data.frame(.x, stringsAsFactors=FALSE) %>% 
-            arrange(pvalues) %>% head(1) %>% 
-            select(gene, QTL_SNP = snp, QTL_P = pvalues, QTL_Beta = beta, QTL_SE = varbeta, QTL_MAF = MAF, QTL_chr, QTL_pos) %>%
-            mutate( QTL_SE = sqrt(QTL_SE) ) # get SE back from Variance
-        })
-        # get out beta, se and P for GWAS SNP 
-        gwas_info <- q %>%
-         map_df( ~{
-            df <- as.data.frame(.x, stringsAsFactors=FALSE) %>%
-                filter(snp == hit_info$GWAS_SNP )
-            if( nrow(df) == 1){
-                df %>%
-                select(GWAS_SNP_Beta = beta,
-                       GWAS_SNP_SE = varbeta,
-                       GWAS_SNP_P = pvalues ) %>%
-                mutate(GWAS_SNP_SE = sqrt(GWAS_SNP_SE) )
-            }else{
-            tibble(GWAS_SNP_Beta = NA, GWAS_SNP_SE = NA, GWAS_SNP_P = NA)
-            }
-        }) 
-        qtl_info <- bind_cols(qtl_info, gwas_info)
-        print(head(qtl_info) )
-    }else{
-        qtl_info <- q %>% 
-        map_df( ~{ 
-            as.data.frame(.x, stringsAsFactors=FALSE) %>% 
-            arrange(pvalues) %>% head(1) %>% 
-            select(gene, QTL_SNP = snp, QTL_P = pvalues, QTL_MAF = MAF, QTL_chr, QTL_pos)
-        }) 
+    qtl_info <- getQTLinfo(q, hit_info)
+     # if sig filtering requested
+    if( !is.null(sig.level ) ){
+        qtl_info <- filter(qtl_info, QTL_P < sig.level)
+        q <- q[ qtl_info$gene ] 
+         message(" * ", length(q), " features in QTL")
+    }
 
+    if( !is.null(qtl2) ){
+        # if second QTL dataset requested
+        q2 <- extractQTL(qtl2, qtl_range)
+        if( is.null(q2) ){ return(NULL) }
+        qtl2_info <- getQTLinfo(q2, hit_info)
+        if( !is.null(sig.level ) ){
+            qtl2_info <- filter(qtl2_info, QTL_P < sig.level)
+            q2 <- q2[ qtl2_info$gene ]
+        }
+        message(" * ", length(q2), " features in QTL 2")
     }
     # actually run COLOC
     message("      * running COLOC")
-    
-    # run COLOC on each QTL gene 
     # return the results table and an object containing g and q
+    # GWAS-QTL COLOC
+    if( is.null(qtl2) ){
+    message(" * GWAS-QTL COLOC")
     coloc_res <- 
         purrr::map( q, ~{
-            # if QTL has no overlapping SNPs with GWAS summary then return NULL
             if( length(intersect(g$snp, .x$snp) ) == 0 ){ 
                 message("Hold up! GWAS and QTL have no SNPs in common")
                 return(NULL) 
@@ -564,10 +572,41 @@ runCOLOC <- function(gwas, qtl, hit){
             # add in g and q to coloc object
             gq <- dplyr::inner_join(as.data.frame(g),as.data.frame(.x), by = "snp", suffix = c(".gwas", ".qtl") )
             coloc_object$results <- dplyr::inner_join(coloc_object$results, gq, by = "snp" )
-            # pull out coloc summary as data.frame
             coloc_df <- as.data.frame(t(coloc_object$summary), stringsAsFactors = FALSE)
             return( list(df = coloc_df, object = coloc_object) )
         })
+    }else{
+    message(" * QTL-QTL COLOC")
+    # remove features with lead SNPs > 1e-5
+    # test all pairs of QTL features in the locus
+    qtl_combos <- expand.grid(1:length(q), 1:length(q2) )
+    message(" * testing ", nrow(qtl_combos), " pairs of QTLs")
+    # qtl_info is matched on to COLOC results using gene names - but for QTL-QTL there are two genes, so instead use the index of the pair
+    qtl_info <- 
+        full_join( 
+            qtl_info[qtl_combos$Var1,] %>% mutate(index = 1:nrow(qtl_combos) ), 
+            qtl2_info[qtl_combos$Var2,] %>% mutate(index = 1:nrow(qtl_combos) ), 
+            by = "index", 
+            suffix = c(".qtl1", ".qtl2") ) %>%
+            dplyr::mutate(gene = as.character(index)) %>%
+            select(-index)
+
+    coloc_res <- 
+        purrr::map( 1:nrow(qtl_combos), ~{
+            q_1 = q[[ qtl_combos[.x, "Var1"] ]]
+            q_2 = q2[[ qtl_combos[.x, "Var2"] ]]
+            message( " * ", .x, ": ", unique(q_1$gene), " vs ",  unique(q_2$gene) )
+             if( length(intersect(q_1$snp, q_2$snp) ) == 0 ){
+                message("Hold up! GWAS and QTL have no SNPs in common")
+                return(NULL)
+            }
+            coloc_object <- coloc.abf(dataset1 = q_1, dataset2 = q_2)
+            qq <- dplyr::inner_join(as.data.frame(q_1),as.data.frame(q_2), by = "snp", suffix = c(".qtl1", ".qtl2") )
+            coloc_object$results <- dplyr::inner_join(coloc_object$results, qq, by = "snp" )
+            coloc_df <- as.data.frame(t(coloc_object$summary), stringsAsFactors = FALSE)
+            return( list(df = coloc_df, object = coloc_object) )
+        })
+    }
     if( is.null(coloc_res) ){ return(NULL) }
 
     message("       * COLOC finished")
@@ -578,7 +617,8 @@ runCOLOC <- function(gwas, qtl, hit){
     # bind coloc_res to other tables
     full_res <- full_join(qtl_info, coloc_df, by = "gene")
     full_res <- full_join(hit_info, full_res, by = "locus")
-
+    # remove rows with no coloc run
+    full_res <- filter(full_res, !is.na(PP.H4.abf) )
     # add pairwise LD between GWAS and QTL SNP
     #full_res <- calc_LD(full_res)
 
@@ -618,6 +658,8 @@ option_list <- list(
         make_option(c('-o', '--outFolder'), help='the path to the output file', default = ""),
         make_option(c('--gwas', '-g'), help= "the dataset ID for a GWAS in the GWAS/QTL database" ),
         make_option(c('--qtl', '-q'), help = "the dataset ID for a QTL dataset in the GWAS/QTL database"),
+        make_option(c('--qtl2', '-r'), help = "a second QTL dataset - using this will trigger QTL-QTL COLOC", default = NULL),
+        make_option(c('--sig', '-s'), help = "the minimum p-value a QTL should have for testing", default = NULL),
         make_option(c('--debug'), help = "load all files and then save RData without running COLOC", action = "store_true", default = FALSE)
 )
 
@@ -628,50 +670,41 @@ opt <- parse_args(option.parser)
 outFolder <- opt$outFolder
 gwas_dataset <- opt$gwas
 qtl_dataset <- opt$qtl
+qtl_dataset_2 <- opt$qtl2
 debug <- opt$debug
-#gwas_prefix <- "/sc/arion/projects/als-omics/ALS_GWAS/Nicolas_2018/processed/Nicolas_2018_processed_"
-#qtl_prefix <- "/sc/arion/projects/als-omics/QTL/NYGC_Freeze02_European_Feb2020/QTL-mapping-pipeline/results/LumbarSpinalCord_expression/peer30/LumbarSpinalCord_expression_peer30"
-#qtl_prefix <- "/sc/arion/projects/als-omics/QTL/NYGC_Freeze02_European_Feb2020/QTL-mapping-pipeline/results/LumbarSpinalCord_splicing/peer20/LumbarSpinalCord_splicing_peer20"
+sig <- opt$sig
 
-#hits_file <- "/sc/arion/projects/als-omics/QTL/NYGC_Freeze02_European_Feb2020/downstream-QTL/COLOC/Nicolas_2018/Nicolas_2018_hits_1e-7.tsv"
-#outFile <- "test_coloc_results.tsv"
-
-#options(echo = TRUE)
-
-# load in data
-# extract top GWAS loci
-# for each locus run COLOC
-maf_1000gp1 <- "/sc/arion/projects/ad-omics/data/references/1KGP1/1000G_EUR_MAF.bed.gz"
+#maf_1000gp1 <- "/sc/arion/projects/ad-omics/data/references/1KGP1/1000G_EUR_MAF.bed.gz"
 maf_1000gp3 <- "/sc/arion/projects/ad-omics/data/references/1KGPp3v5/EUR_MAF/EUR.all.phase3_MAF.bed.gz"
 
-# load in MAF table
-if( !exists("maf_1000g")){
-
-        maf_1000g <- loadMAF(maf_1000gp3)
-        # load in liftover chain
-        chain_hg19_hg38 <- import.chain("/sc/arion/projects/ad-omics/data/references/liftOver/hg19ToHg38.over.chain")    
-    }
 
 main <- function(){
-    
-        #gwas_dataset <- "Kunkle_2019"
-    #gwas_dataset <- "Nalls23andMe_2019"
     #gwas_dataset <- "Marioni_2018"
     #qtl_dataset <- "Microglia_THA"
     
     gwas <- pullData(gwas_dataset, "GWAS")
     qtl <- pullData(qtl_dataset, "QTL")
     
+    if( !is.null(qtl_dataset_2) ){
+        qtl2 <- pullData(qtl_dataset_2, "QTL")
+    }
+    
     top_loci <- extractLoci(gwas)
     
-    # for testing
-    #top_loci <- top_loci[2,]
+
+    # load in MAF table
+    if( !exists("maf_1000g")){
+
+        maf_1000g <- loadMAF(maf_1000gp3)
+        # load in liftover chain
+        chain_hg19_hg38 <- import.chain("/sc/arion/projects/ad-omics/data/references/liftOver/hg19ToHg38.over.chain")    
+    }
+
     if( debug == TRUE){ save.image("debug.RData") }
  
-     
     all_coloc <- purrr::map(
         1:nrow(top_loci), ~{
-            res <- runCOLOC(gwas, qtl, hit = top_loci[.x,])
+            res <- runCOLOC(gwas, qtl, qtl2, hit = top_loci[.x,], sig.level = sig)
             if(is.null(res) ){return(NULL) }
             return(res)
         }
@@ -679,12 +712,13 @@ main <- function(){
     all_res <- map_df(all_coloc, "full_res")
     all_obj <- map(all_coloc, "coloc_object")    
     names(all_obj) <- top_loci$locus
-    
-    
-    # arrange by H4
     all_res <- arrange(all_res, desc(PP.H4.abf)  )
     
     outFile <- paste0(outFolder, qtl_dataset, "_", gwas_dataset, "_COLOC.tsv")
+    if( !is.null(qtl_dataset_2)){
+        outFile <- paste0(outFolder, qtl_dataset, "_", qtl_dataset_2, "_", gwas_dataset, "_COLOC.tsv")
+    }    
+
     readr::write_tsv(all_res, path = outFile)
     # save COLOC objects for plotting
     save(all_obj, file = gsub("tsv", "RData", outFile))
@@ -692,3 +726,4 @@ main <- function(){
 
 main()
 
+

From c33e39eeb8cf009e147f275b30b25acf07cfae98 Mon Sep 17 00:00:00 2001
From: Jack Humphrey <jack.humphrey@mssm.edu>
Date: Fri, 29 Mar 2024 18:48:31 -0400
Subject: [PATCH 2/7] added qtl coloc to the snakemake pipeline

---
 COLOC/Snakefile                            |  39 +++++-
 COLOC/scripts/prepare_pairwise_qtl_coloc.R |  73 +++++++++++
 COLOC/scripts/run_COLOC.R                  | 146 +++++++++++++--------
 3 files changed, 199 insertions(+), 59 deletions(-)
 create mode 100644 COLOC/scripts/prepare_pairwise_qtl_coloc.R

diff --git a/COLOC/Snakefile b/COLOC/Snakefile
index a503f63..74284c9 100644
--- a/COLOC/Snakefile
+++ b/COLOC/Snakefile
@@ -7,22 +7,29 @@ outFolder = config["outFolder"]
 geneMeta = config["geneMeta"]
 
 # sanity check - can the GWAS and QTL data be found in the database?
-gwas_df = pd.read_excel("/sc/arion/projects/ad-omics/data/references//GWAS/GWAS-QTL_data_dictionary.xlsx", sheet_name = 2)
-qtl_df = pd.read_excel("/sc/arion/projects/ad-omics/data/references//GWAS/GWAS-QTL_data_dictionary.xlsx", sheet_name = 1)
+gwas_df = pd.read_excel("/sc/arion/projects/ad-omics/data/references//GWAS/GWAS-QTL_data_dictionary.xlsx", sheet_name = 1)
+qtl_df = pd.read_excel("/sc/arion/projects/ad-omics/data/references//GWAS/GWAS-QTL_data_dictionary.xlsx", sheet_name = 2)
 
 gwas_check = all(i in gwas_df["dataset"].tolist() for i in gwas_data )
 qtl_check = all(i in qtl_df["dataset"].tolist() for i in qtl_data )
 
-if not all([gwas_check, qtl_check]):
-    print(" * QTL and/or GWAS cannot be found in database")
+if not all([gwas_check]):
+    print(" * GWAS cannot be found in database")
     sys.exit()
 
+if not all([qtl_check]):
+    print(" * QTL cannot be found in database")
+    print(qtl_check)
+    sys.exit()
+
+
 shell.prefix("export PS1=""; ml anaconda3; CONDA_BASE=$(conda info --base); source $CONDA_BASE/etc/profile.d/conda.sh; ml purge; conda activate snakemake;")
 rule all:
     input:
         #expand(outFolder + "{GWAS}/{QTL}/{QTL}_{GWAS}_coloc_results.snp.1kgp3_ld.tsv.gz", GWAS = gwas_data, QTL = qtl_data)
         outFolder + "all_COLOC_results_merged_H4_0_no_LD.tsv.gz",
         outFolder + "all_COLOC_results_merged_H4_0.5_with_LD.tsv.gz",
+        outFolder + "pairwise_QTL_results.tsv.gz"
         #expand( outFolder + "{GWAS}/{QTL}_{GWAS}_COLOC.tsv", GWAS = gwas_data, QTL = qtl_data )
         #outFolder + "all_COLOC_summary_results.tsv.gz"
 
@@ -46,7 +53,7 @@ rule merge_COLOC_snp_level:
     params:
         script = "scripts/merge_COLOC_results.R"
     shell:
-        "ml R/4.0.3;"
+        "ml R/3.6.0;" # LDLINKR only installed on 3.6
         "Rscript {params.script} --threshold 0 --inFolder {outFolder} --geneMeta {geneMeta};" 
         "Rscript {params.script} --threshold 0.5 --ld --inFolder {outFolder} --geneMeta {geneMeta} "
 
@@ -61,3 +68,25 @@ rule create_LD_tables:
     shell:
        "conda activate echoR;"
        "Rscript {params.script} -g {wildcards.GWAS} -q {wildcards.QTL} --inFile {input} --outFolder {params.out}" 
+
+rule prepare_QTL_COLOC:
+    input:
+        outFolder + "all_COLOC_results_merged_H4_0.5_with_LD.tsv.gz"
+    output:
+        outFolder + "pairwise_QTL_COLOC_targets.tsv.gz"
+    params:
+        script = "scripts/prepare_pairwise_qtl_coloc.R"
+    shell:
+        "ml R/4.2.0;"
+        "Rscript {params.script} -i {input} -o {output} -m across -p 0.8"
+
+rule run_QTL_COLOC:
+    input:
+        outFolder + "pairwise_QTL_COLOC_targets.tsv.gz"
+    output:
+        outFolder + "pairwise_QTL_results.tsv.gz"
+    params:
+        script = "scripts/run_COLOC.R"
+    shell:
+        "ml R/4.2.0;"
+        "Rscript {params.script} -t {input} -o {output}"
diff --git a/COLOC/scripts/prepare_pairwise_qtl_coloc.R b/COLOC/scripts/prepare_pairwise_qtl_coloc.R
new file mode 100644
index 0000000..1c5028e
--- /dev/null
+++ b/COLOC/scripts/prepare_pairwise_qtl_coloc.R
@@ -0,0 +1,73 @@
+library(optparse)
+
+option_list <- list(
+        make_option(c('-o', '--outFile'), help='the path to the output file', default = ""),
+        make_option(c('-i', '--inFile'), help='the path to the input file', default = ""),
+        make_option(c('-m', '--mode'), help='mode - across or within?', default = "across"),
+        make_option(c('-p', '--pp'), help='min PP4', default = 0.5),
+        make_option(c('--debug'), help = "load all files and then save RData without running COLOC", action = "store_true", default = FALSE)
+)
+
+option.parser <- OptionParser(option_list=option_list)
+opt <- parse_args(option.parser)
+
+inFile <- opt$inFile
+outFile <- opt$outFile
+mode <- opt$mode
+PP4 <- opt$pp
+library(tidyverse)
+
+#all_coloc <- read_tsv("../all_COLOC_results_merged_H4_0.5_with_LD.tsv.gz")
+all_coloc <- read_tsv(inFile)
+
+all_coloc$distance <- abs(all_coloc$GWAS_pos - all_coloc$QTL_pos)
+all_coloc$distance_filter <- (!is.na(all_coloc$LD) & all_coloc$LD > 0.1) | all_coloc$distance < 5e5
+
+qtl_coloc_input <- all_coloc %>%
+  filter(distance_filter == TRUE & PP.H4.abf > PP4) %>%
+  select(GWAS, QTL, locus, feature) %>%
+  distinct()
+
+message(" * ", nrow(qtl_coloc_input), " phenotypes colocalize at PP4 > ", PP4)
+stopifnot( nrow(qtl_coloc_input) > 0)
+
+
+shared_loci <- select(qtl_coloc_input, GWAS, locus, QTL) %>%
+  distinct() %>%
+  group_by(GWAS, locus) %>%
+  tally() %>%
+  filter(n > 1)
+message(" * ", nrow(shared_loci), " loci are shared between QTL types")
+stopifnot( nrow(shared_loci) > 0)
+
+qtl_coloc_input  <- filter(qtl_coloc_input, locus %in% shared_loci$locus)
+
+# get all pairwise comparisons required
+# make optional - do you want to test features within a dataset or just across?
+qtl_coloc_split <- qtl_coloc_input %>%
+  split(paste0(.$GWAS, ":", .$locus)) %>%
+  map_df( ~{
+    d <- select(.x, GWAS,locus, QTL, feature) %>% distinct() 
+    qtl_pairs <- combn(unique(paste(d$QTL, d$feature) ), 2)
+    locus_df <- select(d, GWAS, locus)
+    pair_df <- 
+      data.frame(qtl_1 = qtl_pairs[1,], qtl_2 = qtl_pairs[2,] ) %>%
+      tidyr::separate(qtl_1, into = c("qtl_1", "feature_1"), sep = " ") %>%
+      tidyr::separate(qtl_2, into = c("qtl_2", "feature_2"), sep = " ") %>%
+      mutate(GWAS = unique(d$GWAS), locus = unique(d$locus)) %>%
+      select(GWAS, locus, everything() )
+
+  }) %>%
+  distinct()
+
+if( mode == "across"){
+
+qtl_coloc_split <- qtl_coloc_split %>%
+ mutate( within = qtl_1 == qtl_2) %>% filter(within == FALSE)
+}
+
+write_tsv(qtl_coloc_split, outFile) #"test_eQTL_edQTL_pairwise_input.tsv")
+
+
+
+ 
diff --git a/COLOC/scripts/run_COLOC.R b/COLOC/scripts/run_COLOC.R
index 19b6f96..553c8c0 100644
--- a/COLOC/scripts/run_COLOC.R
+++ b/COLOC/scripts/run_COLOC.R
@@ -36,6 +36,7 @@ joinCoords <- function(input, flank = 0){
         coord <- splitCoords(input)
         coord$start <- coord$start - flank
         coord$end <- coord$end + flank
+        if( coord$start < 0){coord$start <- 0}
         coord <- paste0(coord$chr, ":", coord$start, "-", coord$end)    
         return(coord)
     }
@@ -155,7 +156,7 @@ matchMAF <- function(data, maf){
     # match on RSID
     message("   * before joining MAF: ", nrow(data), " SNPs")
     data$MAF <- maf$MAF[match(data$snp, maf$snp)]
-    matched <- data[ !is.na(data$MAF) ,]
+    matched <- data[ !is.na(data$MAF) & data$MAF >0 & data$MAF < 1,]
     message("   * after joining MAF: ", nrow(matched), " SNPs")
     if(nrow(matched) == 0 ){
         print(head(data) )
@@ -222,7 +223,6 @@ extractGWAS <- function(gwas, coord, refFolder = "/sc/arion/projects/ad-omics/da
     #return(result)
     
     # deal with MAF if missing
-    if( debug == TRUE){ result$MAF <- NA }else{
         if( is.na(mafCol) | force_maf == TRUE ){
             message("       * MAF not present - using 1000 Genomes MAF")
             # how to get this in to the function? it can't find maf_1000g
@@ -231,11 +231,10 @@ extractGWAS <- function(gwas, coord, refFolder = "/sc/arion/projects/ad-omics/da
             message("       * using supplied MAF")
             names(result)[names(col_dict) == mafCol] <- "MAF" 
         }
-    }
     # convert standard error to the variance
     # se is standard error - not standard deviation!
     result$varbeta <- result$varbeta^2
-    print(head(result))
+    #print(head(result))
     #save.image("debug.RData") 
     result <- dplyr::select( result, snp, pvalues, beta, varbeta, MAF, chr, pos, A1, A2)
     
@@ -348,7 +347,7 @@ extractQTL_tabix <- function(qtl, coord){
 # extract all nominal QTL P-values overlapping the flanked GWAS hit
 # split by Gene being tested
 # sig_level doesn't do anything yet
-extractQTL <- function(qtl, coord, sig_level = 0.05, force_maf = FALSE){
+extractQTL <- function(qtl, coord, sig_level = 0.05, force_maf = FALSE, targets = NULL){
     # variables stored in qtl
     # if qtl type is parquet then read in parquet files
     # if qtl type is tabix then query the coordinates through tabix 
@@ -379,7 +378,7 @@ extractQTL <- function(qtl, coord, sig_level = 0.05, force_maf = FALSE){
     columns <- colnames(data.table::fread( cmd = cmd ))
     col_dict <- setNames(1:length(columns), columns)
     
-    print(col_dict)
+    #print(col_dict)
     #stopifnot( ncol(result) == length(col_dict) )
     col_dict <- col_dict[1:ncol(result)] # hacky workaround for now
     names(result)[names(col_dict) == phenoCol] <- "gene"
@@ -410,12 +409,12 @@ extractQTL <- function(qtl, coord, sig_level = 0.05, force_maf = FALSE){
         #stopifnot( "chr" %in% names(col_dict) )
         names(result)[names(col_dict) == "chr"] <- "chr"
         result <- matchMAF(result, maf = maf_1000g)
-        print(head(result) )
+        #print(head(result) )
     }else{
         message("       * using supplied MAF")
         names(result)[names(col_dict) == mafCol] <- "MAF"
-        print(col_dict)
-        print(head(result) )
+        #print(col_dict)
+        #print(head(result) )
     }
     }
     # deal with edge cases - 1 gene and the gene id is NA
@@ -426,8 +425,8 @@ extractQTL <- function(qtl, coord, sig_level = 0.05, force_maf = FALSE){
         result <- dplyr::filter(result, !is.na(gene) )
     }
     # if MAF needs matching then chrCol will be "chr" - change it back to "QTL_chr"
-    print( names(col_dict) )
-    print( names(col_dict) == chrCol )
+    #print( names(col_dict) )
+    #print( names(col_dict) == chrCol )
     names(result)[which(names(col_dict) == chrCol) ]   <- "QTL_chr"
     names(result)[which(names(col_dict) == posCol) ]   <- "QTL_pos"
 
@@ -439,10 +438,17 @@ extractQTL <- function(qtl, coord, sig_level = 0.05, force_maf = FALSE){
     }else{
         res_subset <- dplyr::select(result, gene, snp, pvalues, MAF, QTL_chr, QTL_pos)
     }
-    print(head(res_subset) ) 
+    #print(head(res_subset) ) 
     print("passed this point")
     #dplyr::filter( pos >= coord_split$start & pos <= coord_split$end)
-       
+      
+    if( !is.null(targets) ){
+        res_subset <- dplyr::filter(res_subset, gene == targets)
+        print(paste0(length(unique(res_subset$gene)), " target features remain" ) ) 
+        if( nrow(res_subset) == 0 ){
+            return(NULL)
+        }
+    }
     # split by gene, convert to list object
     res_split <- 
         split(res_subset, res_subset$gene) %>%
@@ -505,7 +511,7 @@ getQTLinfo <- function(q, hit_info){
 ## COLOC results - this should include the GWAS locus, the gene of interest, the top QTL variant and the top QTL p-value
 ## object - this should be a table combining the two input datasets, inner joined on "snp", to be used for plotting.
 ## sig.level now filters out QTLs with P > sig.level
-runCOLOC <- function(gwas, qtl, qtl2 = NULL, hit, sig.level = NULL){
+runCOLOC <- function(gwas, qtl, qtl2 = NULL, hit, sig.level = NULL, target.file = NULL){
     message(" * Analysing LOCUS: ", hit$locus )
     # hit is a dataframe containing "snp", "chr", "pos", "locus"
     
@@ -513,8 +519,10 @@ runCOLOC <- function(gwas, qtl, qtl2 = NULL, hit, sig.level = NULL){
     hit_range <- joinCoords(hit, flank = 1e6)
  
     # extract GWAS and QTL summary for given coord
-    message("       * extracting GWAS")
-    g <- extractGWAS(gwas, hit_range)
+    if( is.null(qtl2) ){
+        message("       * extracting GWAS")
+        g <- extractGWAS(gwas, hit_range)
+    }
     # get hit info from GWAS
     hit_snp <- hit$snp
   
@@ -534,8 +542,8 @@ runCOLOC <- function(gwas, qtl, qtl2 = NULL, hit, sig.level = NULL){
     qtl_range <- joinCoords(qtl_coord, flank = 1e6)
     
     message("       * extracting QTLs")
-    q <- extractQTL(qtl, qtl_range)
-    print(names(q[[1]]))
+    q <- extractQTL(qtl, qtl_range, targets = target.file[1])
+    #print(names(q[[1]]))
     if( is.null(q) ){ return(NULL) }
     qtl_info <- getQTLinfo(q, hit_info)
      # if sig filtering requested
@@ -547,7 +555,7 @@ runCOLOC <- function(gwas, qtl, qtl2 = NULL, hit, sig.level = NULL){
 
     if( !is.null(qtl2) ){
         # if second QTL dataset requested
-        q2 <- extractQTL(qtl2, qtl_range)
+        q2 <- extractQTL(qtl2, qtl_range, targets = target.file[2])
         if( is.null(q2) ){ return(NULL) }
         qtl2_info <- getQTLinfo(q2, hit_info)
         if( !is.null(sig.level ) ){
@@ -596,7 +604,9 @@ runCOLOC <- function(gwas, qtl, qtl2 = NULL, hit, sig.level = NULL){
             q_1 = q[[ qtl_combos[.x, "Var1"] ]]
             q_2 = q2[[ qtl_combos[.x, "Var2"] ]]
             message( " * ", .x, ": ", unique(q_1$gene), " vs ",  unique(q_2$gene) )
-             if( length(intersect(q_1$snp, q_2$snp) ) == 0 ){
+            #message( head(q_1) )
+            #message( head(q_2) ) 
+            if( length(intersect(q_1$snp, q_2$snp) ) == 0 ){
                 message("Hold up! GWAS and QTL have no SNPs in common")
                 return(NULL)
             }
@@ -656,10 +666,12 @@ calc_LD <- function( coloc_res ){
 
 option_list <- list(
         make_option(c('-o', '--outFolder'), help='the path to the output file', default = ""),
-        make_option(c('--gwas', '-g'), help= "the dataset ID for a GWAS in the GWAS/QTL database" ),
+        make_option(c('--gwas', '-g'), help= "the dataset ID for a GWAS in the GWAS/QTL database", default = NULL ),
         make_option(c('--qtl', '-q'), help = "the dataset ID for a QTL dataset in the GWAS/QTL database"),
         make_option(c('--qtl2', '-r'), help = "a second QTL dataset - using this will trigger QTL-QTL COLOC", default = NULL),
         make_option(c('--sig', '-s'), help = "the minimum p-value a QTL should have for testing", default = NULL),
+        make_option(c('--loci', '-l'), help = "text file of loci names", default = NULL),
+        make_option(c('--targets', '-t'), help = "TSV file of features, loci and GWAS for targeted analysis", default = NULL),
         make_option(c('--debug'), help = "load all files and then save RData without running COLOC", action = "store_true", default = FALSE)
 )
 
@@ -673,55 +685,81 @@ qtl_dataset <- opt$qtl
 qtl_dataset_2 <- opt$qtl2
 debug <- opt$debug
 sig <- opt$sig
+loci <- opt$loci
+targets <- opt$targets
+
 
 #maf_1000gp1 <- "/sc/arion/projects/ad-omics/data/references/1KGP1/1000G_EUR_MAF.bed.gz"
 maf_1000gp3 <- "/sc/arion/projects/ad-omics/data/references/1KGPp3v5/EUR_MAF/EUR.all.phase3_MAF.bed.gz"
 
 
-main <- function(){
-    #gwas_dataset <- "Marioni_2018"
-    #qtl_dataset <- "Microglia_THA"
-    
-    gwas <- pullData(gwas_dataset, "GWAS")
-    qtl <- pullData(qtl_dataset, "QTL")
-    
-    if( !is.null(qtl_dataset_2) ){
-        qtl2 <- pullData(qtl_dataset_2, "QTL")
-    }
-    
-    top_loci <- extractLoci(gwas)
-    
-
-    # load in MAF table
-    if( !exists("maf_1000g")){
+# load in MAF table
+if( !exists("maf_1000g")){
 
-        maf_1000g <- loadMAF(maf_1000gp3)
-        # load in liftover chain
-        chain_hg19_hg38 <- import.chain("/sc/arion/projects/ad-omics/data/references/liftOver/hg19ToHg38.over.chain")    
-    }
+  maf_1000g <- loadMAF(maf_1000gp3)
+  # load in liftover chain
+  chain_hg19_hg38 <- import.chain("/sc/arion/projects/ad-omics/data/references/liftOver/hg19ToHg38.over.chain")    
+}
 
+main <- function(){
     if( debug == TRUE){ save.image("debug.RData") }
- 
-    all_coloc <- purrr::map(
+    # regular GWAS-QTL COLOC
+    if( is.null(targets) ){ 
+        gwas <- pullData(gwas_dataset, "GWAS")
+        qtl <- pullData(qtl_dataset, "QTL")
+    
+        if( !is.null(qtl_dataset_2) ){
+            qtl2 <- pullData(qtl_dataset_2, "QTL")
+        }else{ qtl2 <- NULL}
+        top_loci <- extractLoci(gwas)
+        print(paste0(" testing ", nrow(top_loci), " loci ") )
+
+        all_coloc <- purrr::map(
         1:nrow(top_loci), ~{
             res <- runCOLOC(gwas, qtl, qtl2, hit = top_loci[.x,], sig.level = sig)
             if(is.null(res) ){return(NULL) }
             return(res)
+        })
+    }
+    # QTL-QTL COLOC with pairwise targets
+    if(!is.null(targets) ){
+        targets <- read_tsv(targets)
+        print( paste0(" * testing ", nrow(targets), " pairs of QTLs" ) )
+        # for testing
+        #targets <- head(targets, 3)
+        all_coloc <- purrr::map(
+        1:nrow(targets), ~{
+            print( paste0(" * pair ", .x , " of ", nrow(targets) ) )
+            pair_df <- targets[.x,]
+            gwas <- pullData(pair_df$GWAS, "GWAS")
+            locus <- extractLoci(gwas) %>% filter(locus == pair_df$locus)
+            qtl1 <- pullData(pair_df$qtl_1, "QTL")
+            qtl2 <- pullData(pair_df$qtl_2, "QTL")
+            features <- c(pair_df$feature_1, pair_df$feature_2)
+            res <- runCOLOC(gwas, qtl1, qtl2, hit = locus, sig.level = sig, target.file = features)
+            if(is.null(res) ){return(NULL) }
+            return(res)
         }
-    )
-    all_res <- map_df(all_coloc, "full_res")
-    all_obj <- map(all_coloc, "coloc_object")    
-    names(all_obj) <- top_loci$locus
-    all_res <- arrange(all_res, desc(PP.H4.abf)  )
-    
-    outFile <- paste0(outFolder, qtl_dataset, "_", gwas_dataset, "_COLOC.tsv")
-    if( !is.null(qtl_dataset_2)){
-        outFile <- paste0(outFolder, qtl_dataset, "_", qtl_dataset_2, "_", gwas_dataset, "_COLOC.tsv")
-    }    
+        )
+    }
 
+    all_res <- purrr::map_df(all_coloc, "full_res")
+    all_obj <- purrr::map(all_coloc, "coloc_object")    
+    if(!is.null(targets) ){
+        outFile <- outFolder
+        #outFile <- paste0(outFolder, unique(targets$locus), "_all_pairwise_QTL_COLOC.tsv")
+        all_res <- dplyr::bind_cols(targets, all_res %>% dplyr::select(-locus) )
+    }else{
+        outFile <- paste0(outFolder, qtl_dataset, "_", gwas_dataset, "_COLOC.tsv")
+        names(all_obj) <- top_loci$locus
+    }
+    all_res <- dplyr::arrange(all_res, desc(PP.H4.abf)  )
+    
     readr::write_tsv(all_res, path = outFile)
     # save COLOC objects for plotting
-    save(all_obj, file = gsub("tsv", "RData", outFile))
+    outData <- gsub("tsv.gz", "RData", outFile)
+    outData <- gsub("tsv", "RData", outFile)
+    save(all_obj, file = outData)
 }
 
 main()

From f4519b4b7b1a84b6079643625e5ff34e6e11b2f3 Mon Sep 17 00:00:00 2001
From: Jack Humphrey <jack.humphrey@mssm.edu>
Date: Wed, 3 Apr 2024 12:32:17 -0400
Subject: [PATCH 3/7] remove duplicate snp-genes caused by multi-allellic SNPs

---
 COLOC/scripts/run_COLOC.R | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/COLOC/scripts/run_COLOC.R b/COLOC/scripts/run_COLOC.R
index 553c8c0..9dfcf96 100644
--- a/COLOC/scripts/run_COLOC.R
+++ b/COLOC/scripts/run_COLOC.R
@@ -438,6 +438,8 @@ extractQTL <- function(qtl, coord, sig_level = 0.05, force_maf = FALSE, targets
     }else{
         res_subset <- dplyr::select(result, gene, snp, pvalues, MAF, QTL_chr, QTL_pos)
     }
+    # edge case - multiallelic SNPs lead to two entries per SNP-gene - remove
+    res_subset <- dplyr::distinct(res_subset)
     #print(head(res_subset) ) 
     print("passed this point")
     #dplyr::filter( pos >= coord_split$start & pos <= coord_split$end)

From 62d7a25eddef969bc9ffb56667147bbe7c549951 Mon Sep 17 00:00:00 2001
From: Jack Humphrey <jack.humphrey@mssm.edu>
Date: Thu, 11 Apr 2024 18:39:54 -0400
Subject: [PATCH 4/7] updates; still working on pairwise QTL coloc

---
 COLOC/pairwise_qtl_coloc.smk |  78 ++++++++++++++++++++++++++
 COLOC/scripts/run_COLOC.R    | 104 +++++++++++++++++++++--------------
 2 files changed, 142 insertions(+), 40 deletions(-)
 create mode 100644 COLOC/pairwise_qtl_coloc.smk

diff --git a/COLOC/pairwise_qtl_coloc.smk b/COLOC/pairwise_qtl_coloc.smk
new file mode 100644
index 0000000..6fc7eac
--- /dev/null
+++ b/COLOC/pairwise_qtl_coloc.smk
@@ -0,0 +1,78 @@
+import sys
+import pandas as pd
+# Pairwise QTL COLOC pipeline
+
+outFolder = config["outFolder"]
+inFolder = config["inFolder"]
+
+target_files = [ x for x in glob.glob(inFolder + "/*genewide_QTL_pairs*tsv.gz" )]
+
+
+
+shell.prefix("export PS1=""; ml anaconda3; CONDA_BASE=$(conda info --base); source $CONDA_BASE/etc/profile.d/conda.sh; ml purge; conda activate snakemake;")
+rule all:
+    input:
+        #expand(outFolder + "{GWAS}/{QTL}/{QTL}_{GWAS}_coloc_results.snp.1kgp3_ld.tsv.gz", GWAS = gwas_data, QTL = qtl_data)
+        #outFolder + "all_COLOC_results_merged_H4_0_no_LD.tsv.gz",
+        #outFolder + "all_COLOC_results_merged_H4_0.5_with_LD.tsv.gz",
+        #outFolder + "pairwise_QTL_results.tsv.gz"
+        #expand( outFolder + "{GWAS}/{QTL}_{GWAS}_COLOC.tsv", GWAS = gwas_data, QTL = qtl_data )
+        #outFolder + "all_COLOC_summary_results.tsv.gz"
+
+rule run_COLOC:
+    output:
+        outFolder + "{GWAS}/{QTL}_{GWAS}_COLOC.tsv",
+        outFolder +  "{GWAS}/{QTL}_{GWAS}_COLOC.RData"
+    params:
+        script = "scripts/run_COLOC.R"
+    shell:
+        "mkdir -p {outFolder}{wildcards.GWAS};"
+        "ml R/4.0.3;" #ml arrow;"
+        "Rscript {params.script} -g {wildcards.GWAS} -q {wildcards.QTL} -o {outFolder}{wildcards.GWAS}/ "
+   
+rule merge_COLOC_snp_level:
+    input:
+        expand( outFolder + "{GWAS}/{QTL}_{GWAS}_COLOC.tsv", GWAS = gwas_data, QTL = qtl_data )
+    output:
+        outFolder + "all_COLOC_results_merged_H4_0_no_LD.tsv.gz",
+        outFolder + "all_COLOC_results_merged_H4_0.5_with_LD.tsv.gz"
+    params:
+        script = "scripts/merge_COLOC_results.R"
+    shell:
+        "ml R/3.6.0;" # LDLINKR only installed on 3.6
+        "Rscript {params.script} --threshold 0 --inFolder {outFolder} --geneMeta {geneMeta};" 
+        "Rscript {params.script} --threshold 0.5 --ld --inFolder {outFolder} --geneMeta {geneMeta} "
+
+rule create_LD_tables:
+    input:
+        outFolder +  "{GWAS}/{QTL}_{GWAS}_COLOC.RData"
+    output:
+        outFolder +  "{GWAS}/{QTL}/{QTL}_{GWAS}_coloc_results.snp.1kgp3_ld.tsv.gz"
+    params:
+        script = "scripts/prepare_gwas_qtls.R",
+        out = outFolder +  "{GWAS}/{QTL}/"
+    shell:
+       "conda activate echoR;"
+       "Rscript {params.script} -g {wildcards.GWAS} -q {wildcards.QTL} --inFile {input} --outFolder {params.out}" 
+
+rule prepare_QTL_COLOC:
+    input:
+        outFolder + "all_COLOC_results_merged_H4_0.5_with_LD.tsv.gz"
+    output:
+        outFolder + "pairwise_QTL_COLOC_targets.tsv.gz"
+    params:
+        script = "scripts/prepare_pairwise_qtl_coloc.R"
+    shell:
+        "ml R/4.2.0;"
+        "Rscript {params.script} -i {input} -o {output} -m across -p 0.8"
+
+rule run_QTL_COLOC:
+    input:
+        outFolder + "pairwise_QTL_COLOC_targets.tsv.gz"
+    output:
+        outFolder + "pairwise_QTL_results.tsv.gz"
+    params:
+        script = "scripts/run_COLOC.R"
+    shell:
+        "ml R/4.2.0;"
+        "Rscript {params.script} -t {input} -o {output}"
diff --git a/COLOC/scripts/run_COLOC.R b/COLOC/scripts/run_COLOC.R
index 9dfcf96..6b7c217 100644
--- a/COLOC/scripts/run_COLOC.R
+++ b/COLOC/scripts/run_COLOC.R
@@ -26,6 +26,7 @@ suppressPackageStartupMessages(library(optparse))
 suppressPackageStartupMessages(library(GenomicRanges))
 suppressPackageStartupMessages(library(rtracklayer))
 
+options(future.globals.maxSize = 8000 * 1024^2)
 
 #library(arrow)
 
@@ -133,7 +134,7 @@ extractLoci <- function(gwas){
 # already filtered from ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20100804/supporting/EUR.2of4intersection_allele_freq.20100804.sites.vcf.gz
 # all sites with an RSID and 2+ alleles 
 loadMAF <- function(path){
-    message(" * loading in MAF from 1000 Genomes")
+    message(Sys.time(), " * loading in MAF from 1000 Genomes")
     maf <- data.table::fread(path, nThread = 4)
     names(maf) <- c("chr", "pos", "snp", "MAF")
     maf$chr <- as.character(maf$chr)
@@ -513,7 +514,8 @@ getQTLinfo <- function(q, hit_info){
 ## COLOC results - this should include the GWAS locus, the gene of interest, the top QTL variant and the top QTL p-value
 ## object - this should be a table combining the two input datasets, inner joined on "snp", to be used for plotting.
 ## sig.level now filters out QTLs with P > sig.level
-runCOLOC <- function(gwas, qtl, qtl2 = NULL, hit, sig.level = NULL, target.file = NULL){
+## bare - if TRUE then only return COLOC summary, not full object (saves memory)
+runCOLOC <- function(gwas, qtl, qtl2 = NULL, hit, sig.level = NULL, target.file = NULL, bare = FALSE){
     message(" * Analysing LOCUS: ", hit$locus )
     # hit is a dataframe containing "snp", "chr", "pos", "locus"
     
@@ -529,20 +531,24 @@ runCOLOC <- function(gwas, qtl, qtl2 = NULL, hit, sig.level = NULL, target.file
     hit_snp <- hit$snp
   
     # problem when no p column in hit!
-
-    hit_info <- hit %>%
-            select(locus, GWAS_SNP = snp, GWAS_P = p)
-    # if GWAS is hg19 then lift over to hg38
-    if(gwas$build != qtl$build){
-        qtl_coord <- liftOverCoord(hit_coord, from = gwas$build, to = qtl$build)
+    if(is.null(gwas) ){
+        hit_info <- hit %>%
+            select(locus) %>% mutate(GWAS_SNP = NA, GWAS_P = NA)
+        qtl_range <- hit_range
     }else{
-        qtl_coord <- hit_coord
+        hit_info <- hit %>%
+            select(locus, GWAS_SNP = snp, GWAS_P = p)
+        # if GWAS is hg19 then lift over to hg38
+        if(gwas$build != qtl$build){
+            qtl_coord <- liftOverCoord(hit_coord, from = gwas$build, to = qtl$build)
+        }else{
+            qtl_coord <- hit_coord
+        }
+        # record the position of the GWAS top SNP
+        hit_info$GWAS_chr <- splitCoords(qtl_coord)$chr
+        hit_info$GWAS_pos <- splitCoords(qtl_coord)$end
+        qtl_range <- joinCoords(qtl_coord, flank = 1e6)
     }
-    # record the position of the GWAS top SNP
-    hit_info$GWAS_chr <- splitCoords(qtl_coord)$chr
-    hit_info$GWAS_pos <- splitCoords(qtl_coord)$end
-    qtl_range <- joinCoords(qtl_coord, flank = 1e6)
-    
     message("       * extracting QTLs")
     q <- extractQTL(qtl, qtl_range, targets = target.file[1])
     #print(names(q[[1]]))
@@ -606,8 +612,6 @@ runCOLOC <- function(gwas, qtl, qtl2 = NULL, hit, sig.level = NULL, target.file
             q_1 = q[[ qtl_combos[.x, "Var1"] ]]
             q_2 = q2[[ qtl_combos[.x, "Var2"] ]]
             message( " * ", .x, ": ", unique(q_1$gene), " vs ",  unique(q_2$gene) )
-            #message( head(q_1) )
-            #message( head(q_2) ) 
             if( length(intersect(q_1$snp, q_2$snp) ) == 0 ){
                 message("Hold up! GWAS and QTL have no SNPs in common")
                 return(NULL)
@@ -631,10 +635,11 @@ runCOLOC <- function(gwas, qtl, qtl2 = NULL, hit, sig.level = NULL, target.file
     full_res <- full_join(hit_info, full_res, by = "locus")
     # remove rows with no coloc run
     full_res <- filter(full_res, !is.na(PP.H4.abf) )
-    # add pairwise LD between GWAS and QTL SNP
-    #full_res <- calc_LD(full_res)
-
-    return(list(full_res = full_res, coloc_object = coloc_res) ) 
+    if(bare == TRUE){
+        return(list(full_res = full_res) )
+    }else{
+        return(list(full_res = full_res, coloc_object = coloc_res) ) 
+    }
 }
 
 # for each COLOC result per locus
@@ -674,12 +679,14 @@ option_list <- list(
         make_option(c('--sig', '-s'), help = "the minimum p-value a QTL should have for testing", default = NULL),
         make_option(c('--loci', '-l'), help = "text file of loci names", default = NULL),
         make_option(c('--targets', '-t'), help = "TSV file of features, loci and GWAS for targeted analysis", default = NULL),
-        make_option(c('--debug'), help = "load all files and then save RData without running COLOC", action = "store_true", default = FALSE)
+        make_option(c('--debug'), help = "load all files and then save RData without running COLOC", action = "store_true", default = FALSE),
+        make_option(c('--lowmem', '-b'), help = "do not save COLOC objects, just the summaries", action = "store_true", default = FALSE),
+        make_option(c('--threads', '-c'), help = "how many threads to use", default = 1)
 )
 
 option.parser <- OptionParser(option_list=option_list)
 opt <- parse_args(option.parser)
-
+print(opt)
 
 outFolder <- opt$outFolder
 gwas_dataset <- opt$gwas
@@ -688,9 +695,9 @@ qtl_dataset_2 <- opt$qtl2
 debug <- opt$debug
 sig <- opt$sig
 loci <- opt$loci
-targets <- opt$targets
-
-
+target_file <- opt$targets
+low_mem <- opt$lowmem
+threads <- opt$threads
 #maf_1000gp1 <- "/sc/arion/projects/ad-omics/data/references/1KGP1/1000G_EUR_MAF.bed.gz"
 maf_1000gp3 <- "/sc/arion/projects/ad-omics/data/references/1KGPp3v5/EUR_MAF/EUR.all.phase3_MAF.bed.gz"
 
@@ -704,9 +711,10 @@ if( !exists("maf_1000g")){
 }
 
 main <- function(){
+    message(Sys.time()," * starting COLOC pipeline")
     if( debug == TRUE){ save.image("debug.RData") }
     # regular GWAS-QTL COLOC
-    if( is.null(targets) ){ 
+    if( is.null(target_file) ){ 
         gwas <- pullData(gwas_dataset, "GWAS")
         qtl <- pullData(qtl_dataset, "QTL")
     
@@ -724,31 +732,40 @@ main <- function(){
         })
     }
     # QTL-QTL COLOC with pairwise targets
-    if(!is.null(targets) ){
-        targets <- read_tsv(targets)
+    if(!is.null(target_file) ){
+        targets <- read_tsv(target_file)
         print( paste0(" * testing ", nrow(targets), " pairs of QTLs" ) )
         # for testing
-        #targets <- head(targets, 3)
-        all_coloc <- purrr::map(
+        targets <- head(targets, 100)
+        if(threads > 1){
+            suppressPackageStartupMessages(library(furrr))
+            plan(multisession, workers = threads)
+            map <- future_map
+        }
+        all_coloc <- map(
         1:nrow(targets), ~{
             print( paste0(" * pair ", .x , " of ", nrow(targets) ) )
             pair_df <- targets[.x,]
-            gwas <- pullData(pair_df$GWAS, "GWAS")
-            locus <- extractLoci(gwas) %>% filter(locus == pair_df$locus)
+            if( "GWAS" %in% names(pair_df) ){
+                gwas <- pullData(pair_df$GWAS, "GWAS") 
+                locus <- extractLoci(gwas) %>% filter(locus == pair_df$locus)
+            }else{
+                locus <- pair_df
+                gwas <- NULL
+            }
             qtl1 <- pullData(pair_df$qtl_1, "QTL")
             qtl2 <- pullData(pair_df$qtl_2, "QTL")
             features <- c(pair_df$feature_1, pair_df$feature_2)
-            res <- runCOLOC(gwas, qtl1, qtl2, hit = locus, sig.level = sig, target.file = features)
+            res <- runCOLOC(gwas, qtl1, qtl2, hit = locus, sig.level = sig, target.file = features, bare = low_mem)
             if(is.null(res) ){return(NULL) }
             return(res)
         }
         )
     }
-
+    
     all_res <- purrr::map_df(all_coloc, "full_res")
-    all_obj <- purrr::map(all_coloc, "coloc_object")    
-    if(!is.null(targets) ){
-        outFile <- outFolder
+    if(!is.null(target_file) ){
+        outFile <- gsub(".tsv.gz", "_COLOC.tsv.gz", target_file)
         #outFile <- paste0(outFolder, unique(targets$locus), "_all_pairwise_QTL_COLOC.tsv")
         all_res <- dplyr::bind_cols(targets, all_res %>% dplyr::select(-locus) )
     }else{
@@ -756,12 +773,19 @@ main <- function(){
         names(all_obj) <- top_loci$locus
     }
     all_res <- dplyr::arrange(all_res, desc(PP.H4.abf)  )
+    message(Sys.time()," * finished COLOC pipeline")
     
     readr::write_tsv(all_res, path = outFile)
     # save COLOC objects for plotting
-    outData <- gsub("tsv.gz", "RData", outFile)
-    outData <- gsub("tsv", "RData", outFile)
-    save(all_obj, file = outData)
+    if( !low_mem ){
+        all_obj <- purrr::map(all_coloc, "coloc_object")
+        if( is.null(targets) ){
+            names(all_obj) <- top_loci$locus
+        }
+        outData <- gsub("tsv.gz", "RData", outFile)
+        outData <- gsub("tsv", "RData", outFile)
+        save(all_obj, file = outData)
+    }
 }
 
 main()

From 9a29744e3dbd4d89f97c59be94847281520126f5 Mon Sep 17 00:00:00 2001
From: Jack Humphrey <jack.humphrey@mssm.edu>
Date: Thu, 5 Sep 2024 16:03:07 -0400
Subject: [PATCH 5/7] fixed bug on outfolder

---
 COLOC/cluster.yaml           |   5 +-
 COLOC/pairwise_qtl_coloc.smk |  74 ++++----------------
 COLOC/scripts/run_COLOC.R    | 128 +++++++++++++++++++++--------------
 GWAS/processGWAS.R           |   4 +-
 4 files changed, 97 insertions(+), 114 deletions(-)

diff --git a/COLOC/cluster.yaml b/COLOC/cluster.yaml
index 337150f..e5eddc7 100644
--- a/COLOC/cluster.yaml
+++ b/COLOC/cluster.yaml
@@ -1,5 +1,4 @@
 __default__:
-  #partition: chimera
   queue: premium
   cores: 1
   mem: 24000
@@ -8,3 +7,7 @@ __default__:
   output: logs/{rule}:{wildcards}.stdout
   error: logs/{rule}:{wildcards}.stderr
   himem: ""
+pairwise_QTL_COLOC:
+  cores: 4
+  mem: 8000
+  time: 8640
diff --git a/COLOC/pairwise_qtl_coloc.smk b/COLOC/pairwise_qtl_coloc.smk
index 6fc7eac..478e166 100644
--- a/COLOC/pairwise_qtl_coloc.smk
+++ b/COLOC/pairwise_qtl_coloc.smk
@@ -1,78 +1,30 @@
 import sys
 import pandas as pd
-# Pairwise QTL COLOC pipeline
+import glob
+import re
+import os
 
+# Pairwise QTL COLOC pipeline
+# Jack Humphrey
 outFolder = config["outFolder"]
 inFolder = config["inFolder"]
 
-target_files = [ x for x in glob.glob(inFolder + "/*genewide_QTL_pairs*tsv.gz" )]
-
+target_files = [ x for x in glob.glob(inFolder + "*genewide_QTL_pairs*tsv.gz" )]
+target_ids = [os.path.basename(re.sub('.tsv.gz', '', x, count=1)) for x in target_files ]
 
 
 shell.prefix("export PS1=""; ml anaconda3; CONDA_BASE=$(conda info --base); source $CONDA_BASE/etc/profile.d/conda.sh; ml purge; conda activate snakemake;")
 rule all:
     input:
-        #expand(outFolder + "{GWAS}/{QTL}/{QTL}_{GWAS}_coloc_results.snp.1kgp3_ld.tsv.gz", GWAS = gwas_data, QTL = qtl_data)
-        #outFolder + "all_COLOC_results_merged_H4_0_no_LD.tsv.gz",
-        #outFolder + "all_COLOC_results_merged_H4_0.5_with_LD.tsv.gz",
-        #outFolder + "pairwise_QTL_results.tsv.gz"
-        #expand( outFolder + "{GWAS}/{QTL}_{GWAS}_COLOC.tsv", GWAS = gwas_data, QTL = qtl_data )
-        #outFolder + "all_COLOC_summary_results.tsv.gz"
-
-rule run_COLOC:
-    output:
-        outFolder + "{GWAS}/{QTL}_{GWAS}_COLOC.tsv",
-        outFolder +  "{GWAS}/{QTL}_{GWAS}_COLOC.RData"
-    params:
-        script = "scripts/run_COLOC.R"
-    shell:
-        "mkdir -p {outFolder}{wildcards.GWAS};"
-        "ml R/4.0.3;" #ml arrow;"
-        "Rscript {params.script} -g {wildcards.GWAS} -q {wildcards.QTL} -o {outFolder}{wildcards.GWAS}/ "
-   
-rule merge_COLOC_snp_level:
-    input:
-        expand( outFolder + "{GWAS}/{QTL}_{GWAS}_COLOC.tsv", GWAS = gwas_data, QTL = qtl_data )
-    output:
-        outFolder + "all_COLOC_results_merged_H4_0_no_LD.tsv.gz",
-        outFolder + "all_COLOC_results_merged_H4_0.5_with_LD.tsv.gz"
-    params:
-        script = "scripts/merge_COLOC_results.R"
-    shell:
-        "ml R/3.6.0;" # LDLINKR only installed on 3.6
-        "Rscript {params.script} --threshold 0 --inFolder {outFolder} --geneMeta {geneMeta};" 
-        "Rscript {params.script} --threshold 0.5 --ld --inFolder {outFolder} --geneMeta {geneMeta} "
-
-rule create_LD_tables:
-    input:
-        outFolder +  "{GWAS}/{QTL}_{GWAS}_COLOC.RData"
-    output:
-        outFolder +  "{GWAS}/{QTL}/{QTL}_{GWAS}_coloc_results.snp.1kgp3_ld.tsv.gz"
-    params:
-        script = "scripts/prepare_gwas_qtls.R",
-        out = outFolder +  "{GWAS}/{QTL}/"
-    shell:
-       "conda activate echoR;"
-       "Rscript {params.script} -g {wildcards.GWAS} -q {wildcards.QTL} --inFile {input} --outFolder {params.out}" 
-
-rule prepare_QTL_COLOC:
-    input:
-        outFolder + "all_COLOC_results_merged_H4_0.5_with_LD.tsv.gz"
-    output:
-        outFolder + "pairwise_QTL_COLOC_targets.tsv.gz"
-    params:
-        script = "scripts/prepare_pairwise_qtl_coloc.R"
-    shell:
-        "ml R/4.2.0;"
-        "Rscript {params.script} -i {input} -o {output} -m across -p 0.8"
+        expand( outFolder + "{target}_COLOC.tsv.gz", target = target_ids)
 
-rule run_QTL_COLOC:
+rule pairwise_QTL_COLOC:
     input:
-        outFolder + "pairwise_QTL_COLOC_targets.tsv.gz"
+        inFolder + "{target}.tsv.gz"
     output:
-        outFolder + "pairwise_QTL_results.tsv.gz"
+        outFolder + "{target}_COLOC.tsv.gz"
     params:
         script = "scripts/run_COLOC.R"
     shell:
-        "ml R/4.2.0;"
-        "Rscript {params.script} -t {input} -o {output}"
+        "ml R;" 
+        "Rscript {params.script} --targets {input} -o {outFolder} --lowmem --threads 1"
diff --git a/COLOC/scripts/run_COLOC.R b/COLOC/scripts/run_COLOC.R
index 6b7c217..4755024 100644
--- a/COLOC/scripts/run_COLOC.R
+++ b/COLOC/scripts/run_COLOC.R
@@ -30,6 +30,20 @@ options(future.globals.maxSize = 8000 * 1024^2)
 
 #library(arrow)
 
+message_ <- function(text){
+    message(Sys.time(), " * ", text)   
+}
+## verbose messaging
+message_verbose <- function(input){
+    if(verbose){
+        if( "data.frame" %in% class(input)){
+            print(input)
+        }
+        if( class(input) == "character"){
+            message(Sys.time(), "     * ", input)
+        }
+    }
+}
 # flank coordinates by set number of bases (default 1MB)
 # work on either a coordinate string or a dataframe containing chr start and end columns
 joinCoords <- function(input, flank = 0){
@@ -45,10 +59,15 @@ joinCoords <- function(input, flank = 0){
         stopifnot( all(c("chr", "start", "end") %in% names(input) ) | all(c("chr","pos") %in% names(input) )  )
         stopifnot( flank >= 0)
         if( all( c("chr","start","end") %in% names(input) ) ){
-            coords <- paste0(input$chr, ":", input$start - flank, "-",input$end + flank) 
+            # deal with flanks that go negative
+            starts <- input$start - flank
+            starts[starts < 0 ] <- 1
+            coords <- paste0(input$chr, ":", starts, "-",input$end + flank) 
         }
         if( all( c("chr", "pos") %in% names(input) ) ){
-            coords <- paste0(input$chr, ":", input$pos - (flank + 1), "-", input$pos + flank)
+            starts <- input$pos - (flank + 1)
+            starts[starts < 0 ] <- 1
+            coords <- paste0(input$chr, ":", starts, "-", input$pos + flank)
         }
         return(coords)
     }
@@ -63,10 +82,10 @@ splitCoords <- function(coords){
 }
 
 pullData <- function(dataset, type = "GWAS"){
-    message(Sys.time()," * selected dataset: ", dataset)
+    message_verbose(paste0("selected dataset: ", dataset))
     db_path <- "/sc/arion/projects/ad-omics/data/references/GWAS/GWAS-QTL_data_dictionary.xlsx"
 
-    message(Sys.time()," * reading GWAS database from ", db_path)
+    message_verbose(paste0("reading GWAS database from ", db_path))
     stopifnot( file.exists(db_path) )
 
     gwas_db <- suppressMessages(readxl::read_excel(db_path, sheet = type, na= c("", "-","NA")))
@@ -134,7 +153,7 @@ extractLoci <- function(gwas){
 # already filtered from ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20100804/supporting/EUR.2of4intersection_allele_freq.20100804.sites.vcf.gz
 # all sites with an RSID and 2+ alleles 
 loadMAF <- function(path){
-    message(Sys.time(), " * loading in MAF from 1000 Genomes")
+    message_("loading in MAF from 1000 Genomes")
     maf <- data.table::fread(path, nThread = 4)
     names(maf) <- c("chr", "pos", "snp", "MAF")
     maf$chr <- as.character(maf$chr)
@@ -155,12 +174,12 @@ matchMAF <- function(data, maf){
     # subset maf by chromosome
     maf <- maf[chr] 
     # match on RSID
-    message("   * before joining MAF: ", nrow(data), " SNPs")
+    message_verbose(paste0("before joining MAF: ", nrow(data), " SNPs"))
     data$MAF <- maf$MAF[match(data$snp, maf$snp)]
     matched <- data[ !is.na(data$MAF) & data$MAF >0 & data$MAF < 1,]
-    message("   * after joining MAF: ", nrow(matched), " SNPs")
+    message_verbose(paste0("after joining MAF: ", nrow(matched), " SNPs"))
     if(nrow(matched) == 0 ){
-        print(head(data) )
+        message_verbose(head(data) )
     stop("no SNPs match on RSID. Inspect data")
     }
     return(matched)
@@ -201,10 +220,10 @@ extractGWAS <- function(gwas, coord, refFolder = "/sc/arion/projects/ad-omics/da
     }
      
     cmd <- paste( "ml bcftools; tabix -h ", gwas_path, coord ) 
-    message(" * running command: ", cmd)
+    message_verbose(paste0("running command: ", cmd) )
  
     result <- as.data.frame(data.table::fread(cmd = cmd, nThread = 4) )
-    message(" * GWAS extracted!")
+    message_verbose("GWAS extracted!")
     stopifnot( nrow(result) > 0 )
     
      
@@ -225,11 +244,11 @@ extractGWAS <- function(gwas, coord, refFolder = "/sc/arion/projects/ad-omics/da
     
     # deal with MAF if missing
         if( is.na(mafCol) | force_maf == TRUE ){
-            message("       * MAF not present - using 1000 Genomes MAF")
+            message_verbose("MAF not present - using 1000 Genomes MAF")
             # how to get this in to the function? it can't find maf_1000g
             result <- matchMAF(result, maf = maf_1000g)
         }else{
-            message("       * using supplied MAF")
+            message_verbose("using supplied MAF")
             names(result)[names(col_dict) == mafCol] <- "MAF" 
         }
     # convert standard error to the variance
@@ -276,9 +295,9 @@ liftOverCoord <- function(coord_string, from = "hg19", to = "hg38"){
         )
     # lift over using rtracklayer
     # watch out for duplicate entries
-    message(" * lifting over....")
+    message_verbose("lifting over....")
     lifted_over <- rtracklayer::liftOver(coord_gr, chain )
-    message(" * lifted!" )
+    message_verbose("lifted!" )
     #return(lifted_over)
     
     stopifnot(length(lifted_over) == length(coord_gr)  )
@@ -308,7 +327,7 @@ extractQTL_parquet <- function(qtl, coord, sig_level = 0.05){
     perm_res <- dplyr::bind_cols(perm_res, splitCoords(perm_res$variant_id) )
     sig <- dplyr::filter(perm_res, qval < sig_level & chr ==  coord_split$chr & start >= coord_split$start & start <= coord_split$end )
     
-    message( paste0( nrow(sig), " significant genes or splicing events at this locus" ) )
+    message_verbose( paste0( nrow(sig), " significant genes or splicing events at this locus" ) )
     if( nrow(sig) == 0 ){ return(NULL) }
     
     # read in nominal QTL associations
@@ -333,14 +352,14 @@ extractQTL_tabix <- function(qtl, coord){
     }
     stopifnot( file.exists(qtl$full_path) )
     cmd <- paste( "ml bcftools; tabix -h ", qtl$full_path, coord )
-    message("       * running command: ", cmd)
+    message_verbose(paste0("running command: ", cmd))
     result <- as.data.frame(data.table::fread(cmd = cmd, nThread = 4) )
     # deal with regions of no QTL association
     if( nrow(result) == 0){ 
         return(NULL) 
     }
     #stopifnot( nrow(result) > 0 )
-    message("       * QTL extracted!")
+    message_verbose("QTL extracted!")
     return(result)
 }
 
@@ -406,13 +425,13 @@ extractQTL <- function(qtl, coord, sig_level = 0.05, force_maf = FALSE, targets
     # this requires "chr" to be present in QTL data
     if( debug == TRUE){ result$MAF <- NA }else{
     if( is.na(mafCol) | force_maf == TRUE ){
-        message("       * MAF not present - using 1000 Genomes MAF")
+        message_verbose("MAF not present - using 1000 Genomes MAF")
         #stopifnot( "chr" %in% names(col_dict) )
         names(result)[names(col_dict) == "chr"] <- "chr"
         result <- matchMAF(result, maf = maf_1000g)
         #print(head(result) )
     }else{
-        message("       * using supplied MAF")
+        message_verbose("using supplied MAF")
         names(result)[names(col_dict) == mafCol] <- "MAF"
         #print(col_dict)
         #print(head(result) )
@@ -420,7 +439,7 @@ extractQTL <- function(qtl, coord, sig_level = 0.05, force_maf = FALSE, targets
     }
     # deal with edge cases - 1 gene and the gene id is NA
     if( all( is.na(result$gene) ) ){
-        message(" * no genes in QTL result")
+        message_verbose("no genes in QTL result")
         return(NULL)
     }else{
         result <- dplyr::filter(result, !is.na(gene) )
@@ -442,12 +461,14 @@ extractQTL <- function(qtl, coord, sig_level = 0.05, force_maf = FALSE, targets
     # edge case - multiallelic SNPs lead to two entries per SNP-gene - remove
     res_subset <- dplyr::distinct(res_subset)
     #print(head(res_subset) ) 
-    print("passed this point")
+    message_verbose("passed this point")
+    #save(list = ls(all.names = TRUE), file = "image.RData", envir = 
+    #environment())
     #dplyr::filter( pos >= coord_split$start & pos <= coord_split$end)
       
     if( !is.null(targets) ){
         res_subset <- dplyr::filter(res_subset, gene == targets)
-        print(paste0(length(unique(res_subset$gene)), " target features remain" ) ) 
+        message_verbose(paste0(length(unique(res_subset$gene)), " target features remain" ) ) 
         if( nrow(res_subset) == 0 ){
             return(NULL)
         }
@@ -490,7 +511,7 @@ getQTLinfo <- function(q, hit_info){
             }
         }) 
         qtl_info <- bind_cols(qtl_info, gwas_info)
-        print(head(qtl_info) )
+        message_verbose(head(qtl_info) )
     }else{
         qtl_info <- q %>% 
         map_df( ~{ 
@@ -516,7 +537,7 @@ getQTLinfo <- function(q, hit_info){
 ## sig.level now filters out QTLs with P > sig.level
 ## bare - if TRUE then only return COLOC summary, not full object (saves memory)
 runCOLOC <- function(gwas, qtl, qtl2 = NULL, hit, sig.level = NULL, target.file = NULL, bare = FALSE){
-    message(" * Analysing LOCUS: ", hit$locus )
+    message_(paste0("Analysing locus: ", hit$locus ))
     # hit is a dataframe containing "snp", "chr", "pos", "locus"
     
     hit_coord <- joinCoords(hit, flank = 0)
@@ -524,7 +545,7 @@ runCOLOC <- function(gwas, qtl, qtl2 = NULL, hit, sig.level = NULL, target.file
  
     # extract GWAS and QTL summary for given coord
     if( is.null(qtl2) ){
-        message("       * extracting GWAS")
+        message_verbose("extracting GWAS")
         g <- extractGWAS(gwas, hit_range)
     }
     # get hit info from GWAS
@@ -549,7 +570,7 @@ runCOLOC <- function(gwas, qtl, qtl2 = NULL, hit, sig.level = NULL, target.file
         hit_info$GWAS_pos <- splitCoords(qtl_coord)$end
         qtl_range <- joinCoords(qtl_coord, flank = 1e6)
     }
-    message("       * extracting QTLs")
+    message_verbose("extracting QTLs")
     q <- extractQTL(qtl, qtl_range, targets = target.file[1])
     #print(names(q[[1]]))
     if( is.null(q) ){ return(NULL) }
@@ -558,7 +579,7 @@ runCOLOC <- function(gwas, qtl, qtl2 = NULL, hit, sig.level = NULL, target.file
     if( !is.null(sig.level ) ){
         qtl_info <- filter(qtl_info, QTL_P < sig.level)
         q <- q[ qtl_info$gene ] 
-         message(" * ", length(q), " features in QTL")
+         message_verbose(paste0(length(q), " features in QTL"))
     }
 
     if( !is.null(qtl2) ){
@@ -570,14 +591,14 @@ runCOLOC <- function(gwas, qtl, qtl2 = NULL, hit, sig.level = NULL, target.file
             qtl2_info <- filter(qtl2_info, QTL_P < sig.level)
             q2 <- q2[ qtl2_info$gene ]
         }
-        message(" * ", length(q2), " features in QTL 2")
+        message_verbose( paste0(length(q2), " features in QTL 2"))
     }
     # actually run COLOC
-    message("      * running COLOC")
+    message_("running COLOC")
     # return the results table and an object containing g and q
     # GWAS-QTL COLOC
     if( is.null(qtl2) ){
-    message(" * GWAS-QTL COLOC")
+    message_("GWAS-QTL COLOC")
     coloc_res <- 
         purrr::map( q, ~{
             if( length(intersect(g$snp, .x$snp) ) == 0 ){ 
@@ -592,11 +613,11 @@ runCOLOC <- function(gwas, qtl, qtl2 = NULL, hit, sig.level = NULL, target.file
             return( list(df = coloc_df, object = coloc_object) )
         })
     }else{
-    message(" * QTL-QTL COLOC")
+    message_("QTL-QTL COLOC")
     # remove features with lead SNPs > 1e-5
     # test all pairs of QTL features in the locus
     qtl_combos <- expand.grid(1:length(q), 1:length(q2) )
-    message(" * testing ", nrow(qtl_combos), " pairs of QTLs")
+    message_(paste0("testing ", nrow(qtl_combos), " pairs of QTLs"))
     # qtl_info is matched on to COLOC results using gene names - but for QTL-QTL there are two genes, so instead use the index of the pair
     qtl_info <- 
         full_join( 
@@ -611,9 +632,9 @@ runCOLOC <- function(gwas, qtl, qtl2 = NULL, hit, sig.level = NULL, target.file
         purrr::map( 1:nrow(qtl_combos), ~{
             q_1 = q[[ qtl_combos[.x, "Var1"] ]]
             q_2 = q2[[ qtl_combos[.x, "Var2"] ]]
-            message( " * ", .x, ": ", unique(q_1$gene), " vs ",  unique(q_2$gene) )
+            message_( paste0( .x, ": ", unique(q_1$gene), " vs ",  unique(q_2$gene) ) )
             if( length(intersect(q_1$snp, q_2$snp) ) == 0 ){
-                message("Hold up! GWAS and QTL have no SNPs in common")
+                message_("Hold up! GWAS and QTL have no SNPs in common")
                 return(NULL)
             }
             coloc_object <- coloc.abf(dataset1 = q_1, dataset2 = q_2)
@@ -625,11 +646,11 @@ runCOLOC <- function(gwas, qtl, qtl2 = NULL, hit, sig.level = NULL, target.file
     }
     if( is.null(coloc_res) ){ return(NULL) }
 
-    message("       * COLOC finished")
+    message_("COLOC finished")
     coloc_df <- map_df(coloc_res, "df", .id = "gene") %>% 
         dplyr::mutate(locus = hit$locus ) %>%
         dplyr::select(locus, gene, everything() )
-    message("       * COLOC res created")
+    message_("COLOC res created")
     # bind coloc_res to other tables
     full_res <- full_join(qtl_info, coloc_df, by = "gene")
     full_res <- full_join(hit_info, full_res, by = "locus")
@@ -649,7 +670,7 @@ calc_LD <- function( coloc_res ){
     # get LDlink token from .Renviron (only Jack has this, for access go to https://ldlink.nci.nih.gov/?tab=apiaccess)
     token <- Sys.getenv("LDLINK_TOKEN")
     if( token == ""){ 
-        warning(" * LDlink token not found!" )
+        warning(Sys.time(), " * LDlink token not found!" )
         return(NA)
     }
     snps <- unique( c( unique(coloc_res$GWAS_SNP), unique( coloc_res$QTL_SNP) ) )
@@ -681,12 +702,12 @@ option_list <- list(
         make_option(c('--targets', '-t'), help = "TSV file of features, loci and GWAS for targeted analysis", default = NULL),
         make_option(c('--debug'), help = "load all files and then save RData without running COLOC", action = "store_true", default = FALSE),
         make_option(c('--lowmem', '-b'), help = "do not save COLOC objects, just the summaries", action = "store_true", default = FALSE),
-        make_option(c('--threads', '-c'), help = "how many threads to use", default = 1)
+        make_option(c('--threads', '-c'), help = "how many threads to use", default = 1),
+        make_option(c('--verbose', '-v'), help = "if selected then output more information", action = "store_true", default = FALSE)
 )
 
 option.parser <- OptionParser(option_list=option_list)
 opt <- parse_args(option.parser)
-print(opt)
 
 outFolder <- opt$outFolder
 gwas_dataset <- opt$gwas
@@ -698,9 +719,11 @@ loci <- opt$loci
 target_file <- opt$targets
 low_mem <- opt$lowmem
 threads <- opt$threads
+verbose <- opt$verbose
 #maf_1000gp1 <- "/sc/arion/projects/ad-omics/data/references/1KGP1/1000G_EUR_MAF.bed.gz"
 maf_1000gp3 <- "/sc/arion/projects/ad-omics/data/references/1KGPp3v5/EUR_MAF/EUR.all.phase3_MAF.bed.gz"
 
+if(verbose){print(opt)}
 
 # load in MAF table
 if( !exists("maf_1000g")){
@@ -711,10 +734,11 @@ if( !exists("maf_1000g")){
 }
 
 main <- function(){
-    message(Sys.time()," * starting COLOC pipeline")
+    message_("starting COLOC pipeline")
     if( debug == TRUE){ save.image("debug.RData") }
     # regular GWAS-QTL COLOC
     if( is.null(target_file) ){ 
+        message_("GWAS - QTL COLOC")
         gwas <- pullData(gwas_dataset, "GWAS")
         qtl <- pullData(qtl_dataset, "QTL")
     
@@ -722,7 +746,7 @@ main <- function(){
             qtl2 <- pullData(qtl_dataset_2, "QTL")
         }else{ qtl2 <- NULL}
         top_loci <- extractLoci(gwas)
-        print(paste0(" testing ", nrow(top_loci), " loci ") )
+        message_(paste0(" testing ", nrow(top_loci), " loci ") )
 
         all_coloc <- purrr::map(
         1:nrow(top_loci), ~{
@@ -733,10 +757,13 @@ main <- function(){
     }
     # QTL-QTL COLOC with pairwise targets
     if(!is.null(target_file) ){
+        message_("QTL-QTL COLOC")
         targets <- read_tsv(target_file)
-        print( paste0(" * testing ", nrow(targets), " pairs of QTLs" ) )
-        # for testing
-        targets <- head(targets, 100)
+        # for TESTING
+        #targets <- head(targets, 3)
+        message_( paste0("testing ", nrow(targets), " pairs of QTLs" ) )
+        # doesn't work properly - furrr loads the whole MAF object into memory for each parallel run, 100MB per run which is crazy.
+        # for parallel to work I would have to transition to querying the MAF database each time like we do with the sumstats. 
         if(threads > 1){
             suppressPackageStartupMessages(library(furrr))
             plan(multisession, workers = threads)
@@ -744,12 +771,13 @@ main <- function(){
         }
         all_coloc <- map(
         1:nrow(targets), ~{
-            print( paste0(" * pair ", .x , " of ", nrow(targets) ) )
+            message_verbose( paste0("Pair ", .x , " of ", nrow(targets) ) )
             pair_df <- targets[.x,]
             if( "GWAS" %in% names(pair_df) ){
                 gwas <- pullData(pair_df$GWAS, "GWAS") 
                 locus <- extractLoci(gwas) %>% filter(locus == pair_df$locus)
             }else{
+                message_verbose("no GWAS specified - using chr and pos from targets file")
                 locus <- pair_df
                 gwas <- NULL
             }
@@ -757,6 +785,7 @@ main <- function(){
             qtl2 <- pullData(pair_df$qtl_2, "QTL")
             features <- c(pair_df$feature_1, pair_df$feature_2)
             res <- runCOLOC(gwas, qtl1, qtl2, hit = locus, sig.level = sig, target.file = features, bare = low_mem)
+            gc()
             if(is.null(res) ){return(NULL) }
             return(res)
         }
@@ -765,16 +794,15 @@ main <- function(){
     
     all_res <- purrr::map_df(all_coloc, "full_res")
     if(!is.null(target_file) ){
-        outFile <- gsub(".tsv.gz", "_COLOC.tsv.gz", target_file)
-        #outFile <- paste0(outFolder, unique(targets$locus), "_all_pairwise_QTL_COLOC.tsv")
-        all_res <- dplyr::bind_cols(targets, all_res %>% dplyr::select(-locus) )
+        outFile <- file.path(outFolder, gsub(".tsv.gz", "_COLOC.tsv.gz", basename(target_file) ) )
+        all_res <- dplyr::bind_cols(targets, all_res %>% select(-locus) )
     }else{
         outFile <- paste0(outFolder, qtl_dataset, "_", gwas_dataset, "_COLOC.tsv")
         names(all_obj) <- top_loci$locus
     }
     all_res <- dplyr::arrange(all_res, desc(PP.H4.abf)  )
-    message(Sys.time()," * finished COLOC pipeline")
-    
+    message("finished COLOC pipeline")
+    message_verbose(paste0("writing output to ", outFile))
     readr::write_tsv(all_res, path = outFile)
     # save COLOC objects for plotting
     if( !low_mem ){
diff --git a/GWAS/processGWAS.R b/GWAS/processGWAS.R
index 12a1835..97fc82a 100644
--- a/GWAS/processGWAS.R
+++ b/GWAS/processGWAS.R
@@ -81,7 +81,7 @@ get_standard_error <- function(OR, P){
 # write out sorted file to reference folder
 # tabix index the file
 # Ripke has loci with chr23 for X - remove
-sortGWAS <- function(gwas, out_folder = "./"){
+sortGWAS <- function(gwas, out_folder = ""){
     dataset <- gwas$dataset
     
     col_chr <- gwas$full_chrom
@@ -127,7 +127,7 @@ sortGWAS <- function(gwas, out_folder = "./"){
     # remove duplicate rows
     gwas_sorted <- dplyr::distinct(gwas_sorted)
     
-    out_path <- paste0(out_folder, dataset, ".processed.tsv")
+    out_path <- file.path(out_folder, paste0(dataset, ".processed.tsv") )
     message(Sys.time()," * writing sorted GWAS file to ", out_path)
     readr::write_tsv(gwas_sorted, out_path)
     stopifnot(file.exists(out_path) )

From 432626b41584d23eeb49e5427627caf96162b825 Mon Sep 17 00:00:00 2001
From: Jack Humphrey <jack.humphrey@mssm.edu>
Date: Mon, 24 Feb 2025 15:42:46 -0500
Subject: [PATCH 6/7] updates - allow custom MAF, fix bugs

---
 COLOC/pairwise_qtl_coloc.smk |  5 +--
 COLOC/scripts/run_COLOC.R    | 69 +++++++++++++++++++++++++-----------
 2 files changed, 51 insertions(+), 23 deletions(-)

diff --git a/COLOC/pairwise_qtl_coloc.smk b/COLOC/pairwise_qtl_coloc.smk
index 478e166..a79653f 100644
--- a/COLOC/pairwise_qtl_coloc.smk
+++ b/COLOC/pairwise_qtl_coloc.smk
@@ -8,8 +8,9 @@ import os
 # Jack Humphrey
 outFolder = config["outFolder"]
 inFolder = config["inFolder"]
+MAF_file = config["MAF_file"]
 
-target_files = [ x for x in glob.glob(inFolder + "*genewide_QTL_pairs*tsv.gz" )]
+target_files = [ x for x in glob.glob(inFolder + "*genewide*pairs*tsv.gz" )]
 target_ids = [os.path.basename(re.sub('.tsv.gz', '', x, count=1)) for x in target_files ]
 
 
@@ -27,4 +28,4 @@ rule pairwise_QTL_COLOC:
         script = "scripts/run_COLOC.R"
     shell:
         "ml R;" 
-        "Rscript {params.script} --targets {input} -o {outFolder} --lowmem --threads 1"
+        "Rscript {params.script} --targets {input} -o {outFolder} --lowmem --threads 1 --MAF {MAF_file}"
diff --git a/COLOC/scripts/run_COLOC.R b/COLOC/scripts/run_COLOC.R
index 4755024..1904335 100644
--- a/COLOC/scripts/run_COLOC.R
+++ b/COLOC/scripts/run_COLOC.R
@@ -153,9 +153,9 @@ extractLoci <- function(gwas){
 # already filtered from ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20100804/supporting/EUR.2of4intersection_allele_freq.20100804.sites.vcf.gz
 # all sites with an RSID and 2+ alleles 
 loadMAF <- function(path){
-    message_("loading in MAF from 1000 Genomes")
+    #message_("loading in MAF from 1000 Genomes")
     maf <- data.table::fread(path, nThread = 4)
-    names(maf) <- c("chr", "pos", "snp", "MAF")
+    names(maf)[1:4] <- c("chr", "pos", "snp", "MAF")
     maf$chr <- as.character(maf$chr)
     data.table::setkey(maf, "chr")
     maf$MAF <- suppressWarnings(as.numeric(maf$MAF))
@@ -179,8 +179,9 @@ matchMAF <- function(data, maf){
     matched <- data[ !is.na(data$MAF) & data$MAF >0 & data$MAF < 1,]
     message_verbose(paste0("after joining MAF: ", nrow(matched), " SNPs"))
     if(nrow(matched) == 0 ){
-        message_verbose(head(data) )
-    stop("no SNPs match on RSID. Inspect data")
+        #message_verbose(head(data) )
+        message_verbose("no SNPs match on RSID. Inspect data")
+        return(NULL)
     }
     return(matched)
 }
@@ -219,7 +220,7 @@ extractGWAS <- function(gwas, coord, refFolder = "/sc/arion/projects/ad-omics/da
         }
     }
      
-    cmd <- paste( "ml bcftools; tabix -h ", gwas_path, coord ) 
+    cmd <- paste( "ml bcftools/1.9; tabix -h ", gwas_path, coord ) 
     message_verbose(paste0("running command: ", cmd) )
  
     result <- as.data.frame(data.table::fread(cmd = cmd, nThread = 4) )
@@ -351,7 +352,7 @@ extractQTL_tabix <- function(qtl, coord){
         }
     }
     stopifnot( file.exists(qtl$full_path) )
-    cmd <- paste( "ml bcftools; tabix -h ", qtl$full_path, coord )
+    cmd <- paste( "ml bcftools/1.9; tabix -h ", qtl$full_path, coord )
     message_verbose(paste0("running command: ", cmd))
     result <- as.data.frame(data.table::fread(cmd = cmd, nThread = 4) )
     # deal with regions of no QTL association
@@ -411,9 +412,11 @@ extractQTL <- function(qtl, coord, sig_level = 0.05, force_maf = FALSE, targets
         names(result)[names(col_dict) == seCol]    <- "varbeta"
         # don't forget to square the standard error to get the variance             
         result$varbeta <- result$varbeta^2
- 
+        message_verbose(" remove associations with 0 variance")
+        message_verbose(paste0(" before: ", nrow(result), " associations"))
         # remove rows with variance of 0 - this will corrupt COLOC
         result <- dplyr::filter(result, varbeta != 0) 
+        message_verbose(paste0(" after: ", nrow(result), " associations"))
     }
     names(result)[names(col_dict) == snpCol]   <- "snp"
     # Young microglia use log10P - convert
@@ -423,12 +426,13 @@ extractQTL <- function(qtl, coord, sig_level = 0.05, force_maf = FALSE, targets
 
     # deal with MAF - meta-analysis outputs won't have it
     # this requires "chr" to be present in QTL data
-    if( debug == TRUE){ result$MAF <- NA }else{
+    #if( debug == TRUE){ result$MAF <- NA }else{ # why do we do this? breaks it when debug is TRUE
     if( is.na(mafCol) | force_maf == TRUE ){
         message_verbose("MAF not present - using 1000 Genomes MAF")
         #stopifnot( "chr" %in% names(col_dict) )
         names(result)[names(col_dict) == "chr"] <- "chr"
         result <- matchMAF(result, maf = maf_1000g)
+        if( all(is.na(result$MAF) ) | is.null(result) ){ return(NULL) }
         #print(head(result) )
     }else{
         message_verbose("using supplied MAF")
@@ -436,7 +440,7 @@ extractQTL <- function(qtl, coord, sig_level = 0.05, force_maf = FALSE, targets
         #print(col_dict)
         #print(head(result) )
     }
-    }
+    #}
     # deal with edge cases - 1 gene and the gene id is NA
     if( all( is.na(result$gene) ) ){
         message_verbose("no genes in QTL result")
@@ -458,8 +462,14 @@ extractQTL <- function(qtl, coord, sig_level = 0.05, force_maf = FALSE, targets
     }else{
         res_subset <- dplyr::select(result, gene, snp, pvalues, MAF, QTL_chr, QTL_pos)
     }
+    # jan 2025: another edge case - NA pvalues
+    message_verbose("removing NA P-values")
+    res_subset <- filter(res_subset, !is.na(pvalues) )
+    message_verbose(paste0("after: ", nrow(res_subset) ))
     # edge case - multiallelic SNPs lead to two entries per SNP-gene - remove
+    message_verbose("removing multi-allelic SNPs")
     res_subset <- dplyr::distinct(res_subset)
+    message_verbose(paste0("after: ", nrow(res_subset) ))
     #print(head(res_subset) ) 
     message_verbose("passed this point")
     #save(list = ls(all.names = TRUE), file = "image.RData", envir = 
@@ -469,6 +479,7 @@ extractQTL <- function(qtl, coord, sig_level = 0.05, force_maf = FALSE, targets
     if( !is.null(targets) ){
         res_subset <- dplyr::filter(res_subset, gene == targets)
         message_verbose(paste0(length(unique(res_subset$gene)), " target features remain" ) ) 
+        message_verbose(paste0(nrow(res_subset), " associations remain"))
         if( nrow(res_subset) == 0 ){
             return(NULL)
         }
@@ -586,7 +597,12 @@ runCOLOC <- function(gwas, qtl, qtl2 = NULL, hit, sig.level = NULL, target.file
         # if second QTL dataset requested
         q2 <- extractQTL(qtl2, qtl_range, targets = target.file[2])
         if( is.null(q2) ){ return(NULL) }
-        qtl2_info <- getQTLinfo(q2, hit_info)
+        # get QTL betas for lead SNP in QTL 1
+        hit_info_qtl1 <- data.frame(locus = hit_info$locus, GWAS_SNP = qtl_info$QTL_SNP, GWAS_P = qtl_info$QTL_P)
+        qtl2_info <- getQTLinfo(q2, hit_info_qtl1)
+        # get QTL betas for lead SNP in QTL 2
+        hit_info_qtl2 <- data.frame(locus = hit_info$locus, GWAS_SNP = qtl2_info$QTL_SNP, GWAS_P = qtl2_info$QTL_P)
+        qtl_info <- getQTLinfo(q, hit_info_qtl2)
         if( !is.null(sig.level ) ){
             qtl2_info <- filter(qtl2_info, QTL_P < sig.level)
             q2 <- q2[ qtl2_info$gene ]
@@ -644,7 +660,7 @@ runCOLOC <- function(gwas, qtl, qtl2 = NULL, hit, sig.level = NULL, target.file
             return( list(df = coloc_df, object = coloc_object) )
         })
     }
-    if( is.null(coloc_res) ){ return(NULL) }
+    if( is.null(unlist(coloc_res)) ){ return(NULL) }
 
     message_("COLOC finished")
     coloc_df <- map_df(coloc_res, "df", .id = "gene") %>% 
@@ -693,7 +709,7 @@ calc_LD <- function( coloc_res ){
 
 
 option_list <- list(
-        make_option(c('-o', '--outFolder'), help='the path to the output file', default = ""),
+        make_option(c('-o', '--outFolder'), help='the path to the output file', default = "."),
         make_option(c('--gwas', '-g'), help= "the dataset ID for a GWAS in the GWAS/QTL database", default = NULL ),
         make_option(c('--qtl', '-q'), help = "the dataset ID for a QTL dataset in the GWAS/QTL database"),
         make_option(c('--qtl2', '-r'), help = "a second QTL dataset - using this will trigger QTL-QTL COLOC", default = NULL),
@@ -703,6 +719,7 @@ option_list <- list(
         make_option(c('--debug'), help = "load all files and then save RData without running COLOC", action = "store_true", default = FALSE),
         make_option(c('--lowmem', '-b'), help = "do not save COLOC objects, just the summaries", action = "store_true", default = FALSE),
         make_option(c('--threads', '-c'), help = "how many threads to use", default = 1),
+        make_option(c('--MAF', '-m'), help = "path to custom MAF file", default = NULL),
         make_option(c('--verbose', '-v'), help = "if selected then output more information", action = "store_true", default = FALSE)
 )
 
@@ -720,15 +737,22 @@ target_file <- opt$targets
 low_mem <- opt$lowmem
 threads <- opt$threads
 verbose <- opt$verbose
-#maf_1000gp1 <- "/sc/arion/projects/ad-omics/data/references/1KGP1/1000G_EUR_MAF.bed.gz"
-maf_1000gp3 <- "/sc/arion/projects/ad-omics/data/references/1KGPp3v5/EUR_MAF/EUR.all.phase3_MAF.bed.gz"
+MAF <- opt$MAF
+
+# default MAF is 1000g phase 3v5
+# but users can provide their own
+# must have columns chr, pos, snp, MAF
+if( is.null(MAF) ){
+    #maf_1000gp1 <- "/sc/arion/projects/ad-omics/data/references/1KGP1/1000G_EUR_MAF.bed.gz"
+    MAF <- "/sc/arion/projects/ad-omics/data/references/1KGPp3v5/EUR_MAF/EUR.all.phase3_MAF.bed.gz"
+}
 
 if(verbose){print(opt)}
 
 # load in MAF table
-if( !exists("maf_1000g")){
-
-  maf_1000g <- loadMAF(maf_1000gp3)
+if( !exists("maf_1000g") ){
+  message(paste0("loading MAF from ", MAF ) )
+  maf_1000g <- loadMAF(MAF)
   # load in liftover chain
   chain_hg19_hg38 <- import.chain("/sc/arion/projects/ad-omics/data/references/liftOver/hg19ToHg38.over.chain")    
 }
@@ -794,16 +818,19 @@ main <- function(){
     
     all_res <- purrr::map_df(all_coloc, "full_res")
     if(!is.null(target_file) ){
-        outFile <- file.path(outFolder, gsub(".tsv.gz", "_COLOC.tsv.gz", basename(target_file) ) )
-        all_res <- dplyr::bind_cols(targets, all_res %>% select(-locus) )
+        outFile <- file.path(outFolder, gsub(".tsv", "_COLOC.tsv", basename(target_file) ) )
+        # here if any of the COLOCs did not proceed then the target df will be larger than the all_res results        
+
+        #all_res <- dplyr::bind_cols(targets, all_res %>% select(-locus) )
     }else{
-        outFile <- paste0(outFolder, qtl_dataset, "_", gwas_dataset, "_COLOC.tsv")
+        outFile <- paste0(outFolder,"/", qtl_dataset, "_", gwas_dataset, "_COLOC.tsv")
         names(all_obj) <- top_loci$locus
     }
+    dir.create(outFolder,  showWarnings = FALSE)
     all_res <- dplyr::arrange(all_res, desc(PP.H4.abf)  )
     message("finished COLOC pipeline")
     message_verbose(paste0("writing output to ", outFile))
-    readr::write_tsv(all_res, path = outFile)
+    readr::write_tsv(all_res, file = outFile)
     # save COLOC objects for plotting
     if( !low_mem ){
         all_obj <- purrr::map(all_coloc, "coloc_object")

From 483138b452c777112ed7cb768e45ec82bb38d358 Mon Sep 17 00:00:00 2001
From: Jack Humphrey <jack.humphrey@mssm.edu>
Date: Tue, 31 Mar 2026 19:31:03 -0400
Subject: [PATCH 7/7] tweaks to qvalue; adding significance filter to COLOC
 with -s option

---
 COLOC/Snakefile             | 10 +++++-----
 COLOC/cluster.yaml          |  4 ++--
 COLOC/scripts/run_COLOC.R   | 24 +++++++++++++++++++-----
 qvalue_sharing/Snakefile    |  4 ++--
 qvalue_sharing/run_qvalue.R |  6 +++---
 5 files changed, 31 insertions(+), 17 deletions(-)

diff --git a/COLOC/Snakefile b/COLOC/Snakefile
index 74284c9..b6387fe 100644
--- a/COLOC/Snakefile
+++ b/COLOC/Snakefile
@@ -5,7 +5,7 @@ gwas_data = config["gwas"]
 qtl_data = config["qtl"]
 outFolder = config["outFolder"]
 geneMeta = config["geneMeta"]
-
+sig_threshold = config["sig_threshold"]
 # sanity check - can the GWAS and QTL data be found in the database?
 gwas_df = pd.read_excel("/sc/arion/projects/ad-omics/data/references//GWAS/GWAS-QTL_data_dictionary.xlsx", sheet_name = 1)
 qtl_df = pd.read_excel("/sc/arion/projects/ad-omics/data/references//GWAS/GWAS-QTL_data_dictionary.xlsx", sheet_name = 2)
@@ -29,7 +29,7 @@ rule all:
         #expand(outFolder + "{GWAS}/{QTL}/{QTL}_{GWAS}_coloc_results.snp.1kgp3_ld.tsv.gz", GWAS = gwas_data, QTL = qtl_data)
         outFolder + "all_COLOC_results_merged_H4_0_no_LD.tsv.gz",
         outFolder + "all_COLOC_results_merged_H4_0.5_with_LD.tsv.gz",
-        outFolder + "pairwise_QTL_results.tsv.gz"
+        #outFolder + "pairwise_QTL_results.tsv.gz"
         #expand( outFolder + "{GWAS}/{QTL}_{GWAS}_COLOC.tsv", GWAS = gwas_data, QTL = qtl_data )
         #outFolder + "all_COLOC_summary_results.tsv.gz"
 
@@ -41,8 +41,8 @@ rule run_COLOC:
         script = "scripts/run_COLOC.R"
     shell:
         "mkdir -p {outFolder}{wildcards.GWAS};"
-        "ml R/4.0.3;" #ml arrow;"
-        "Rscript {params.script} -g {wildcards.GWAS} -q {wildcards.QTL} -o {outFolder}{wildcards.GWAS}/ "
+        "ml R;" #ml arrow;"
+        "Rscript {params.script} -g {wildcards.GWAS} -q {wildcards.QTL} -o {outFolder}{wildcards.GWAS}/ -s 1e-5 --verbose --lowmem"
    
 rule merge_COLOC_snp_level:
     input:
@@ -53,7 +53,7 @@ rule merge_COLOC_snp_level:
     params:
         script = "scripts/merge_COLOC_results.R"
     shell:
-        "ml R/3.6.0;" # LDLINKR only installed on 3.6
+        "ml R;" # LDLINKR only installed on 3.6
         "Rscript {params.script} --threshold 0 --inFolder {outFolder} --geneMeta {geneMeta};" 
         "Rscript {params.script} --threshold 0.5 --ld --inFolder {outFolder} --geneMeta {geneMeta} "
 
diff --git a/COLOC/cluster.yaml b/COLOC/cluster.yaml
index e5eddc7..bccd9de 100644
--- a/COLOC/cluster.yaml
+++ b/COLOC/cluster.yaml
@@ -1,7 +1,7 @@
 __default__:
   queue: premium
   cores: 1
-  mem: 24000
+  mem: 48000
   time: '120'
   name: $(basename $(pwd)):{rule}:{wildcards}
   output: logs/{rule}:{wildcards}.stdout
@@ -9,5 +9,5 @@ __default__:
   himem: ""
 pairwise_QTL_COLOC:
   cores: 4
-  mem: 8000
+  mem: 16000
   time: 8640
diff --git a/COLOC/scripts/run_COLOC.R b/COLOC/scripts/run_COLOC.R
index 1904335..1758c26 100644
--- a/COLOC/scripts/run_COLOC.R
+++ b/COLOC/scripts/run_COLOC.R
@@ -367,8 +367,8 @@ extractQTL_tabix <- function(qtl, coord){
 
 # extract all nominal QTL P-values overlapping the flanked GWAS hit
 # split by Gene being tested
-# sig_level doesn't do anything yet
-extractQTL <- function(qtl, coord, sig_level = 0.05, force_maf = FALSE, targets = NULL){
+# now implemented sig_level filtering for minimum P-value for each phenotype
+extractQTL <- function(qtl, coord, sig_level = NULL, force_maf = FALSE, targets = NULL){
     # variables stored in qtl
     # if qtl type is parquet then read in parquet files
     # if qtl type is tabix then query the coordinates through tabix 
@@ -423,7 +423,18 @@ extractQTL <- function(qtl, coord, sig_level = 0.05, force_maf = FALSE, targets
     if( pvalCol == "log10_p"){
         result$pvalues <- 10^result$pvalues
     }
-
+    # P-value thresholding - methylation QTLs, there are way too many sites to test all of them
+    if( !is.null(sig_level) ){
+        message_verbose(paste0(" currently: ", length(unique(result$gene) ), " phenotypes in window" ) )
+        message_verbose(paste0(" removing phenotypes with P < ", sig_level))
+        pheno_p <- result %>% dplyr::group_by(gene) %>% dplyr::slice_min(pvalues, n = 1, with_ties = FALSE) %>% dplyr::filter(pvalues < as.numeric(sig_level) )
+        result <- dplyr::filter(result, gene %in% pheno_p$gene)
+        if(nrow(result) == 0){
+            message_verbose(" 0 phenotypes remaining after P thresholding")
+            return(NULL)
+        }
+        message_verbose(paste0("retaining ", nrow(result), " associations from ", nrow(pheno_p), " phenotypes" ))
+    }
     # deal with MAF - meta-analysis outputs won't have it
     # this requires "chr" to be present in QTL data
     #if( debug == TRUE){ result$MAF <- NA }else{ # why do we do this? breaks it when debug is TRUE
@@ -582,7 +593,7 @@ runCOLOC <- function(gwas, qtl, qtl2 = NULL, hit, sig.level = NULL, target.file
         qtl_range <- joinCoords(qtl_coord, flank = 1e6)
     }
     message_verbose("extracting QTLs")
-    q <- extractQTL(qtl, qtl_range, targets = target.file[1])
+    q <- extractQTL(qtl, qtl_range, targets = target.file[1], sig_level = sig.level)
     #print(names(q[[1]]))
     if( is.null(q) ){ return(NULL) }
     qtl_info <- getQTLinfo(q, hit_info)
@@ -827,7 +838,10 @@ main <- function(){
         names(all_obj) <- top_loci$locus
     }
     dir.create(outFolder,  showWarnings = FALSE)
-    all_res <- dplyr::arrange(all_res, desc(PP.H4.abf)  )
+    # in case all COLOCs are null for some reason
+    if(nrow(all_res) > 0){
+        all_res <- dplyr::arrange(all_res, desc(PP.H4.abf)  )
+    }
     message("finished COLOC pipeline")
     message_verbose(paste0("writing output to ", outFile))
     readr::write_tsv(all_res, file = outFile)
diff --git a/qvalue_sharing/Snakefile b/qvalue_sharing/Snakefile
index f6c9c85..4b086d5 100644
--- a/qvalue_sharing/Snakefile
+++ b/qvalue_sharing/Snakefile
@@ -18,7 +18,7 @@ rule run_qvalue:
     output:
         outFolder + "{source}:{target}:{threshold}.qvalue.tsv"
     shell:
-        "ml R/3.6.0;"
+        "ml R;"
         "Rscript {params.script} -s {params.source} -t {params.target} -q {threshold} -o {outFolder}"
 
 rule merge_qvalue:
@@ -29,5 +29,5 @@ rule merge_qvalue:
     output:
         outFolder + "all_qvalue_merged." + threshold + ".tsv"
     shell:
-        "ml R/3.6.0;"
+        "ml R;"
         "Rscript {params.script} -o {outFolder} -q {threshold}"
diff --git a/qvalue_sharing/run_qvalue.R b/qvalue_sharing/run_qvalue.R
index 50a372b..38c6511 100644
--- a/qvalue_sharing/run_qvalue.R
+++ b/qvalue_sharing/run_qvalue.R
@@ -11,10 +11,10 @@ pullData <- function(dataset, type = "GWAS"){
     message(Sys.time()," * reading GWAS database from ", db_path)
     stopifnot( file.exists(db_path) )
     if( type == "GWAS"){
-        n_sheet <- 3
+        n_sheet <- 2
     }
     if( type == "QTL"){
-        n_sheet <- 2
+        n_sheet <- 3
     }
     gwas_db <- suppressMessages(readxl::read_excel(db_path, sheet = n_sheet,na= c("", "-","NA")))
     
@@ -78,7 +78,7 @@ extractTargetQTL <- function(qtl, chr, source_qtls){
     stopifnot( file.exists(qtl$full_path) )
      
     # read in QTL
-    cmd <- paste0("ml bcftools; tabix ", qtl$full_path, " ", chr)
+    cmd <- paste0("ml tabix; tabix ", qtl$full_path, " ", chr)
     message( " * ", cmd) 
     result <- data.table::fread( cmd = cmd, nThread = 8 )
     if( nrow(result) == 0){