From 509ef85bc0d6f67cceec1dd57d2cc554b9d8a8ad Mon Sep 17 00:00:00 2001 From: Jack Humphrey Date: Fri, 23 Feb 2024 16:10:04 -0500 Subject: [PATCH 1/7] added QTL-QTL coloc - still fixing bugs --- COLOC/scripts/run_COLOC.R | 203 ++++++++++++++++++++++---------------- 1 file changed, 119 insertions(+), 84 deletions(-) diff --git a/COLOC/scripts/run_COLOC.R b/COLOC/scripts/run_COLOC.R index 69bc401..19b6f96 100644 --- a/COLOC/scripts/run_COLOC.R +++ b/COLOC/scripts/run_COLOC.R @@ -455,7 +455,44 @@ extractQTL <- function(qtl, coord, sig_level = 0.05, force_maf = FALSE){ return(res_split) } - +# for each gene extract top QTL SNP +getQTLinfo <- function(q, hit_info){ + if( all( c("beta", "varbeta") %in% names(q[[1]]) ) ){ + qtl_info <- q %>% + map_df( ~{ + as.data.frame(.x, stringsAsFactors=FALSE) %>% + arrange(pvalues) %>% head(1) %>% + select(gene, QTL_SNP = snp, QTL_P = pvalues, QTL_Beta = beta, QTL_SE = varbeta, QTL_MAF = MAF, QTL_chr, QTL_pos) %>% + mutate( QTL_SE = sqrt(QTL_SE) ) # get SE back from Variance + }) + # get out beta, se and P for GWAS SNP + gwas_info <- q %>% + map_df( ~{ + df <- as.data.frame(.x, stringsAsFactors=FALSE) %>% + filter(snp == hit_info$GWAS_SNP ) + if( nrow(df) == 1){ + df %>% + select(GWAS_SNP_Beta = beta, + GWAS_SNP_SE = varbeta, + GWAS_SNP_P = pvalues ) %>% + mutate(GWAS_SNP_SE = sqrt(GWAS_SNP_SE) ) + }else{ + tibble(GWAS_SNP_Beta = NA, GWAS_SNP_SE = NA, GWAS_SNP_P = NA) + } + }) + qtl_info <- bind_cols(qtl_info, gwas_info) + print(head(qtl_info) ) + }else{ + qtl_info <- q %>% + map_df( ~{ + as.data.frame(.x, stringsAsFactors=FALSE) %>% + arrange(pvalues) %>% head(1) %>% + select(gene, QTL_SNP = snp, QTL_P = pvalues, QTL_MAF = MAF, QTL_chr, QTL_pos) + }) + + } + return(qtl_info) +} # running COLOC between a GWAS locus and all eQTLs within 1MB either side @@ -467,7 +504,8 @@ extractQTL <- function(qtl, coord, sig_level = 0.05, force_maf = FALSE){ # output a list of objects: ## COLOC results - this should include the GWAS locus, the gene of interest, the top QTL variant and the top QTL p-value ## object - this should be a table combining the two input datasets, inner joined on "snp", to be used for plotting. -runCOLOC <- function(gwas, qtl, hit){ +## sig.level now filters out QTLs with P > sig.level +runCOLOC <- function(gwas, qtl, qtl2 = NULL, hit, sig.level = NULL){ message(" * Analysing LOCUS: ", hit$locus ) # hit is a dataframe containing "snp", "chr", "pos", "locus" @@ -479,24 +517,12 @@ runCOLOC <- function(gwas, qtl, hit){ g <- extractGWAS(gwas, hit_range) # get hit info from GWAS hit_snp <- hit$snp - # this assumes that the hit SNP is within the g object - # if the top hit doesn't have an RSID or cannot be MAF matched then this is violated - # I will have to use the hit object instead and accept that not all top_loci lists include the MAF, P value etc. - #if( hit_snp == hit$snp ){ - #hit_info <- as.data.frame(g, stringsAsFactors = FALSE) %>% filter(snp == hit_snp) %>% - # select(GWAS_SNP = snp, GWAS_P = pvalues, GWAS_effect_size = beta, GWAS_MAF = MAF) %>% - # mutate(locus = hit$locus) %>% select(locus, everything() ) - #}else{ - # now just use top loci - + # problem when no p column in hit! hit_info <- hit %>% select(locus, GWAS_SNP = snp, GWAS_P = p) - #} - # if GWAS is hg19 then lift over to hg38 - # use the lifted over position in hit_info if(gwas$build != qtl$build){ qtl_coord <- liftOverCoord(hit_coord, from = gwas$build, to = qtl$build) }else{ @@ -505,57 +531,39 @@ runCOLOC <- function(gwas, qtl, hit){ # record the position of the GWAS top SNP hit_info$GWAS_chr <- splitCoords(qtl_coord)$chr hit_info$GWAS_pos <- splitCoords(qtl_coord)$end - - qtl_range <- joinCoords(qtl_coord, flank = 1e6) message(" * extracting QTLs") q <- extractQTL(qtl, qtl_range) print(names(q[[1]])) if( is.null(q) ){ return(NULL) } - # for each gene extract top QTL SNP - if( all( c("beta", "varbeta") %in% names(q[[1]]) ) ){ - qtl_info <- q %>% - map_df( ~{ - as.data.frame(.x, stringsAsFactors=FALSE) %>% - arrange(pvalues) %>% head(1) %>% - select(gene, QTL_SNP = snp, QTL_P = pvalues, QTL_Beta = beta, QTL_SE = varbeta, QTL_MAF = MAF, QTL_chr, QTL_pos) %>% - mutate( QTL_SE = sqrt(QTL_SE) ) # get SE back from Variance - }) - # get out beta, se and P for GWAS SNP - gwas_info <- q %>% - map_df( ~{ - df <- as.data.frame(.x, stringsAsFactors=FALSE) %>% - filter(snp == hit_info$GWAS_SNP ) - if( nrow(df) == 1){ - df %>% - select(GWAS_SNP_Beta = beta, - GWAS_SNP_SE = varbeta, - GWAS_SNP_P = pvalues ) %>% - mutate(GWAS_SNP_SE = sqrt(GWAS_SNP_SE) ) - }else{ - tibble(GWAS_SNP_Beta = NA, GWAS_SNP_SE = NA, GWAS_SNP_P = NA) - } - }) - qtl_info <- bind_cols(qtl_info, gwas_info) - print(head(qtl_info) ) - }else{ - qtl_info <- q %>% - map_df( ~{ - as.data.frame(.x, stringsAsFactors=FALSE) %>% - arrange(pvalues) %>% head(1) %>% - select(gene, QTL_SNP = snp, QTL_P = pvalues, QTL_MAF = MAF, QTL_chr, QTL_pos) - }) + qtl_info <- getQTLinfo(q, hit_info) + # if sig filtering requested + if( !is.null(sig.level ) ){ + qtl_info <- filter(qtl_info, QTL_P < sig.level) + q <- q[ qtl_info$gene ] + message(" * ", length(q), " features in QTL") + } + if( !is.null(qtl2) ){ + # if second QTL dataset requested + q2 <- extractQTL(qtl2, qtl_range) + if( is.null(q2) ){ return(NULL) } + qtl2_info <- getQTLinfo(q2, hit_info) + if( !is.null(sig.level ) ){ + qtl2_info <- filter(qtl2_info, QTL_P < sig.level) + q2 <- q2[ qtl2_info$gene ] + } + message(" * ", length(q2), " features in QTL 2") } # actually run COLOC message(" * running COLOC") - - # run COLOC on each QTL gene # return the results table and an object containing g and q + # GWAS-QTL COLOC + if( is.null(qtl2) ){ + message(" * GWAS-QTL COLOC") coloc_res <- purrr::map( q, ~{ - # if QTL has no overlapping SNPs with GWAS summary then return NULL if( length(intersect(g$snp, .x$snp) ) == 0 ){ message("Hold up! GWAS and QTL have no SNPs in common") return(NULL) @@ -564,10 +572,41 @@ runCOLOC <- function(gwas, qtl, hit){ # add in g and q to coloc object gq <- dplyr::inner_join(as.data.frame(g),as.data.frame(.x), by = "snp", suffix = c(".gwas", ".qtl") ) coloc_object$results <- dplyr::inner_join(coloc_object$results, gq, by = "snp" ) - # pull out coloc summary as data.frame coloc_df <- as.data.frame(t(coloc_object$summary), stringsAsFactors = FALSE) return( list(df = coloc_df, object = coloc_object) ) }) + }else{ + message(" * QTL-QTL COLOC") + # remove features with lead SNPs > 1e-5 + # test all pairs of QTL features in the locus + qtl_combos <- expand.grid(1:length(q), 1:length(q2) ) + message(" * testing ", nrow(qtl_combos), " pairs of QTLs") + # qtl_info is matched on to COLOC results using gene names - but for QTL-QTL there are two genes, so instead use the index of the pair + qtl_info <- + full_join( + qtl_info[qtl_combos$Var1,] %>% mutate(index = 1:nrow(qtl_combos) ), + qtl2_info[qtl_combos$Var2,] %>% mutate(index = 1:nrow(qtl_combos) ), + by = "index", + suffix = c(".qtl1", ".qtl2") ) %>% + dplyr::mutate(gene = as.character(index)) %>% + select(-index) + + coloc_res <- + purrr::map( 1:nrow(qtl_combos), ~{ + q_1 = q[[ qtl_combos[.x, "Var1"] ]] + q_2 = q2[[ qtl_combos[.x, "Var2"] ]] + message( " * ", .x, ": ", unique(q_1$gene), " vs ", unique(q_2$gene) ) + if( length(intersect(q_1$snp, q_2$snp) ) == 0 ){ + message("Hold up! GWAS and QTL have no SNPs in common") + return(NULL) + } + coloc_object <- coloc.abf(dataset1 = q_1, dataset2 = q_2) + qq <- dplyr::inner_join(as.data.frame(q_1),as.data.frame(q_2), by = "snp", suffix = c(".qtl1", ".qtl2") ) + coloc_object$results <- dplyr::inner_join(coloc_object$results, qq, by = "snp" ) + coloc_df <- as.data.frame(t(coloc_object$summary), stringsAsFactors = FALSE) + return( list(df = coloc_df, object = coloc_object) ) + }) + } if( is.null(coloc_res) ){ return(NULL) } message(" * COLOC finished") @@ -578,7 +617,8 @@ runCOLOC <- function(gwas, qtl, hit){ # bind coloc_res to other tables full_res <- full_join(qtl_info, coloc_df, by = "gene") full_res <- full_join(hit_info, full_res, by = "locus") - + # remove rows with no coloc run + full_res <- filter(full_res, !is.na(PP.H4.abf) ) # add pairwise LD between GWAS and QTL SNP #full_res <- calc_LD(full_res) @@ -618,6 +658,8 @@ option_list <- list( make_option(c('-o', '--outFolder'), help='the path to the output file', default = ""), make_option(c('--gwas', '-g'), help= "the dataset ID for a GWAS in the GWAS/QTL database" ), make_option(c('--qtl', '-q'), help = "the dataset ID for a QTL dataset in the GWAS/QTL database"), + make_option(c('--qtl2', '-r'), help = "a second QTL dataset - using this will trigger QTL-QTL COLOC", default = NULL), + make_option(c('--sig', '-s'), help = "the minimum p-value a QTL should have for testing", default = NULL), make_option(c('--debug'), help = "load all files and then save RData without running COLOC", action = "store_true", default = FALSE) ) @@ -628,50 +670,41 @@ opt <- parse_args(option.parser) outFolder <- opt$outFolder gwas_dataset <- opt$gwas qtl_dataset <- opt$qtl +qtl_dataset_2 <- opt$qtl2 debug <- opt$debug -#gwas_prefix <- "/sc/arion/projects/als-omics/ALS_GWAS/Nicolas_2018/processed/Nicolas_2018_processed_" -#qtl_prefix <- "/sc/arion/projects/als-omics/QTL/NYGC_Freeze02_European_Feb2020/QTL-mapping-pipeline/results/LumbarSpinalCord_expression/peer30/LumbarSpinalCord_expression_peer30" -#qtl_prefix <- "/sc/arion/projects/als-omics/QTL/NYGC_Freeze02_European_Feb2020/QTL-mapping-pipeline/results/LumbarSpinalCord_splicing/peer20/LumbarSpinalCord_splicing_peer20" +sig <- opt$sig -#hits_file <- "/sc/arion/projects/als-omics/QTL/NYGC_Freeze02_European_Feb2020/downstream-QTL/COLOC/Nicolas_2018/Nicolas_2018_hits_1e-7.tsv" -#outFile <- "test_coloc_results.tsv" - -#options(echo = TRUE) - -# load in data -# extract top GWAS loci -# for each locus run COLOC -maf_1000gp1 <- "/sc/arion/projects/ad-omics/data/references/1KGP1/1000G_EUR_MAF.bed.gz" +#maf_1000gp1 <- "/sc/arion/projects/ad-omics/data/references/1KGP1/1000G_EUR_MAF.bed.gz" maf_1000gp3 <- "/sc/arion/projects/ad-omics/data/references/1KGPp3v5/EUR_MAF/EUR.all.phase3_MAF.bed.gz" -# load in MAF table -if( !exists("maf_1000g")){ - - maf_1000g <- loadMAF(maf_1000gp3) - # load in liftover chain - chain_hg19_hg38 <- import.chain("/sc/arion/projects/ad-omics/data/references/liftOver/hg19ToHg38.over.chain") - } main <- function(){ - - #gwas_dataset <- "Kunkle_2019" - #gwas_dataset <- "Nalls23andMe_2019" #gwas_dataset <- "Marioni_2018" #qtl_dataset <- "Microglia_THA" gwas <- pullData(gwas_dataset, "GWAS") qtl <- pullData(qtl_dataset, "QTL") + if( !is.null(qtl_dataset_2) ){ + qtl2 <- pullData(qtl_dataset_2, "QTL") + } + top_loci <- extractLoci(gwas) - # for testing - #top_loci <- top_loci[2,] + + # load in MAF table + if( !exists("maf_1000g")){ + + maf_1000g <- loadMAF(maf_1000gp3) + # load in liftover chain + chain_hg19_hg38 <- import.chain("/sc/arion/projects/ad-omics/data/references/liftOver/hg19ToHg38.over.chain") + } + if( debug == TRUE){ save.image("debug.RData") } - all_coloc <- purrr::map( 1:nrow(top_loci), ~{ - res <- runCOLOC(gwas, qtl, hit = top_loci[.x,]) + res <- runCOLOC(gwas, qtl, qtl2, hit = top_loci[.x,], sig.level = sig) if(is.null(res) ){return(NULL) } return(res) } @@ -679,12 +712,13 @@ main <- function(){ all_res <- map_df(all_coloc, "full_res") all_obj <- map(all_coloc, "coloc_object") names(all_obj) <- top_loci$locus - - - # arrange by H4 all_res <- arrange(all_res, desc(PP.H4.abf) ) outFile <- paste0(outFolder, qtl_dataset, "_", gwas_dataset, "_COLOC.tsv") + if( !is.null(qtl_dataset_2)){ + outFile <- paste0(outFolder, qtl_dataset, "_", qtl_dataset_2, "_", gwas_dataset, "_COLOC.tsv") + } + readr::write_tsv(all_res, path = outFile) # save COLOC objects for plotting save(all_obj, file = gsub("tsv", "RData", outFile)) @@ -692,3 +726,4 @@ main <- function(){ main() + From c33e39eeb8cf009e147f275b30b25acf07cfae98 Mon Sep 17 00:00:00 2001 From: Jack Humphrey Date: Fri, 29 Mar 2024 18:48:31 -0400 Subject: [PATCH 2/7] added qtl coloc to the snakemake pipeline --- COLOC/Snakefile | 39 +++++- COLOC/scripts/prepare_pairwise_qtl_coloc.R | 73 +++++++++++ COLOC/scripts/run_COLOC.R | 146 +++++++++++++-------- 3 files changed, 199 insertions(+), 59 deletions(-) create mode 100644 COLOC/scripts/prepare_pairwise_qtl_coloc.R diff --git a/COLOC/Snakefile b/COLOC/Snakefile index a503f63..74284c9 100644 --- a/COLOC/Snakefile +++ b/COLOC/Snakefile @@ -7,22 +7,29 @@ outFolder = config["outFolder"] geneMeta = config["geneMeta"] # sanity check - can the GWAS and QTL data be found in the database? -gwas_df = pd.read_excel("/sc/arion/projects/ad-omics/data/references//GWAS/GWAS-QTL_data_dictionary.xlsx", sheet_name = 2) -qtl_df = pd.read_excel("/sc/arion/projects/ad-omics/data/references//GWAS/GWAS-QTL_data_dictionary.xlsx", sheet_name = 1) +gwas_df = pd.read_excel("/sc/arion/projects/ad-omics/data/references//GWAS/GWAS-QTL_data_dictionary.xlsx", sheet_name = 1) +qtl_df = pd.read_excel("/sc/arion/projects/ad-omics/data/references//GWAS/GWAS-QTL_data_dictionary.xlsx", sheet_name = 2) gwas_check = all(i in gwas_df["dataset"].tolist() for i in gwas_data ) qtl_check = all(i in qtl_df["dataset"].tolist() for i in qtl_data ) -if not all([gwas_check, qtl_check]): - print(" * QTL and/or GWAS cannot be found in database") +if not all([gwas_check]): + print(" * GWAS cannot be found in database") sys.exit() +if not all([qtl_check]): + print(" * QTL cannot be found in database") + print(qtl_check) + sys.exit() + + shell.prefix("export PS1=""; ml anaconda3; CONDA_BASE=$(conda info --base); source $CONDA_BASE/etc/profile.d/conda.sh; ml purge; conda activate snakemake;") rule all: input: #expand(outFolder + "{GWAS}/{QTL}/{QTL}_{GWAS}_coloc_results.snp.1kgp3_ld.tsv.gz", GWAS = gwas_data, QTL = qtl_data) outFolder + "all_COLOC_results_merged_H4_0_no_LD.tsv.gz", outFolder + "all_COLOC_results_merged_H4_0.5_with_LD.tsv.gz", + outFolder + "pairwise_QTL_results.tsv.gz" #expand( outFolder + "{GWAS}/{QTL}_{GWAS}_COLOC.tsv", GWAS = gwas_data, QTL = qtl_data ) #outFolder + "all_COLOC_summary_results.tsv.gz" @@ -46,7 +53,7 @@ rule merge_COLOC_snp_level: params: script = "scripts/merge_COLOC_results.R" shell: - "ml R/4.0.3;" + "ml R/3.6.0;" # LDLINKR only installed on 3.6 "Rscript {params.script} --threshold 0 --inFolder {outFolder} --geneMeta {geneMeta};" "Rscript {params.script} --threshold 0.5 --ld --inFolder {outFolder} --geneMeta {geneMeta} " @@ -61,3 +68,25 @@ rule create_LD_tables: shell: "conda activate echoR;" "Rscript {params.script} -g {wildcards.GWAS} -q {wildcards.QTL} --inFile {input} --outFolder {params.out}" + +rule prepare_QTL_COLOC: + input: + outFolder + "all_COLOC_results_merged_H4_0.5_with_LD.tsv.gz" + output: + outFolder + "pairwise_QTL_COLOC_targets.tsv.gz" + params: + script = "scripts/prepare_pairwise_qtl_coloc.R" + shell: + "ml R/4.2.0;" + "Rscript {params.script} -i {input} -o {output} -m across -p 0.8" + +rule run_QTL_COLOC: + input: + outFolder + "pairwise_QTL_COLOC_targets.tsv.gz" + output: + outFolder + "pairwise_QTL_results.tsv.gz" + params: + script = "scripts/run_COLOC.R" + shell: + "ml R/4.2.0;" + "Rscript {params.script} -t {input} -o {output}" diff --git a/COLOC/scripts/prepare_pairwise_qtl_coloc.R b/COLOC/scripts/prepare_pairwise_qtl_coloc.R new file mode 100644 index 0000000..1c5028e --- /dev/null +++ b/COLOC/scripts/prepare_pairwise_qtl_coloc.R @@ -0,0 +1,73 @@ +library(optparse) + +option_list <- list( + make_option(c('-o', '--outFile'), help='the path to the output file', default = ""), + make_option(c('-i', '--inFile'), help='the path to the input file', default = ""), + make_option(c('-m', '--mode'), help='mode - across or within?', default = "across"), + make_option(c('-p', '--pp'), help='min PP4', default = 0.5), + make_option(c('--debug'), help = "load all files and then save RData without running COLOC", action = "store_true", default = FALSE) +) + +option.parser <- OptionParser(option_list=option_list) +opt <- parse_args(option.parser) + +inFile <- opt$inFile +outFile <- opt$outFile +mode <- opt$mode +PP4 <- opt$pp +library(tidyverse) + +#all_coloc <- read_tsv("../all_COLOC_results_merged_H4_0.5_with_LD.tsv.gz") +all_coloc <- read_tsv(inFile) + +all_coloc$distance <- abs(all_coloc$GWAS_pos - all_coloc$QTL_pos) +all_coloc$distance_filter <- (!is.na(all_coloc$LD) & all_coloc$LD > 0.1) | all_coloc$distance < 5e5 + +qtl_coloc_input <- all_coloc %>% + filter(distance_filter == TRUE & PP.H4.abf > PP4) %>% + select(GWAS, QTL, locus, feature) %>% + distinct() + +message(" * ", nrow(qtl_coloc_input), " phenotypes colocalize at PP4 > ", PP4) +stopifnot( nrow(qtl_coloc_input) > 0) + + +shared_loci <- select(qtl_coloc_input, GWAS, locus, QTL) %>% + distinct() %>% + group_by(GWAS, locus) %>% + tally() %>% + filter(n > 1) +message(" * ", nrow(shared_loci), " loci are shared between QTL types") +stopifnot( nrow(shared_loci) > 0) + +qtl_coloc_input <- filter(qtl_coloc_input, locus %in% shared_loci$locus) + +# get all pairwise comparisons required +# make optional - do you want to test features within a dataset or just across? +qtl_coloc_split <- qtl_coloc_input %>% + split(paste0(.$GWAS, ":", .$locus)) %>% + map_df( ~{ + d <- select(.x, GWAS,locus, QTL, feature) %>% distinct() + qtl_pairs <- combn(unique(paste(d$QTL, d$feature) ), 2) + locus_df <- select(d, GWAS, locus) + pair_df <- + data.frame(qtl_1 = qtl_pairs[1,], qtl_2 = qtl_pairs[2,] ) %>% + tidyr::separate(qtl_1, into = c("qtl_1", "feature_1"), sep = " ") %>% + tidyr::separate(qtl_2, into = c("qtl_2", "feature_2"), sep = " ") %>% + mutate(GWAS = unique(d$GWAS), locus = unique(d$locus)) %>% + select(GWAS, locus, everything() ) + + }) %>% + distinct() + +if( mode == "across"){ + +qtl_coloc_split <- qtl_coloc_split %>% + mutate( within = qtl_1 == qtl_2) %>% filter(within == FALSE) +} + +write_tsv(qtl_coloc_split, outFile) #"test_eQTL_edQTL_pairwise_input.tsv") + + + + diff --git a/COLOC/scripts/run_COLOC.R b/COLOC/scripts/run_COLOC.R index 19b6f96..553c8c0 100644 --- a/COLOC/scripts/run_COLOC.R +++ b/COLOC/scripts/run_COLOC.R @@ -36,6 +36,7 @@ joinCoords <- function(input, flank = 0){ coord <- splitCoords(input) coord$start <- coord$start - flank coord$end <- coord$end + flank + if( coord$start < 0){coord$start <- 0} coord <- paste0(coord$chr, ":", coord$start, "-", coord$end) return(coord) } @@ -155,7 +156,7 @@ matchMAF <- function(data, maf){ # match on RSID message(" * before joining MAF: ", nrow(data), " SNPs") data$MAF <- maf$MAF[match(data$snp, maf$snp)] - matched <- data[ !is.na(data$MAF) ,] + matched <- data[ !is.na(data$MAF) & data$MAF >0 & data$MAF < 1,] message(" * after joining MAF: ", nrow(matched), " SNPs") if(nrow(matched) == 0 ){ print(head(data) ) @@ -222,7 +223,6 @@ extractGWAS <- function(gwas, coord, refFolder = "/sc/arion/projects/ad-omics/da #return(result) # deal with MAF if missing - if( debug == TRUE){ result$MAF <- NA }else{ if( is.na(mafCol) | force_maf == TRUE ){ message(" * MAF not present - using 1000 Genomes MAF") # how to get this in to the function? it can't find maf_1000g @@ -231,11 +231,10 @@ extractGWAS <- function(gwas, coord, refFolder = "/sc/arion/projects/ad-omics/da message(" * using supplied MAF") names(result)[names(col_dict) == mafCol] <- "MAF" } - } # convert standard error to the variance # se is standard error - not standard deviation! result$varbeta <- result$varbeta^2 - print(head(result)) + #print(head(result)) #save.image("debug.RData") result <- dplyr::select( result, snp, pvalues, beta, varbeta, MAF, chr, pos, A1, A2) @@ -348,7 +347,7 @@ extractQTL_tabix <- function(qtl, coord){ # extract all nominal QTL P-values overlapping the flanked GWAS hit # split by Gene being tested # sig_level doesn't do anything yet -extractQTL <- function(qtl, coord, sig_level = 0.05, force_maf = FALSE){ +extractQTL <- function(qtl, coord, sig_level = 0.05, force_maf = FALSE, targets = NULL){ # variables stored in qtl # if qtl type is parquet then read in parquet files # if qtl type is tabix then query the coordinates through tabix @@ -379,7 +378,7 @@ extractQTL <- function(qtl, coord, sig_level = 0.05, force_maf = FALSE){ columns <- colnames(data.table::fread( cmd = cmd )) col_dict <- setNames(1:length(columns), columns) - print(col_dict) + #print(col_dict) #stopifnot( ncol(result) == length(col_dict) ) col_dict <- col_dict[1:ncol(result)] # hacky workaround for now names(result)[names(col_dict) == phenoCol] <- "gene" @@ -410,12 +409,12 @@ extractQTL <- function(qtl, coord, sig_level = 0.05, force_maf = FALSE){ #stopifnot( "chr" %in% names(col_dict) ) names(result)[names(col_dict) == "chr"] <- "chr" result <- matchMAF(result, maf = maf_1000g) - print(head(result) ) + #print(head(result) ) }else{ message(" * using supplied MAF") names(result)[names(col_dict) == mafCol] <- "MAF" - print(col_dict) - print(head(result) ) + #print(col_dict) + #print(head(result) ) } } # deal with edge cases - 1 gene and the gene id is NA @@ -426,8 +425,8 @@ extractQTL <- function(qtl, coord, sig_level = 0.05, force_maf = FALSE){ result <- dplyr::filter(result, !is.na(gene) ) } # if MAF needs matching then chrCol will be "chr" - change it back to "QTL_chr" - print( names(col_dict) ) - print( names(col_dict) == chrCol ) + #print( names(col_dict) ) + #print( names(col_dict) == chrCol ) names(result)[which(names(col_dict) == chrCol) ] <- "QTL_chr" names(result)[which(names(col_dict) == posCol) ] <- "QTL_pos" @@ -439,10 +438,17 @@ extractQTL <- function(qtl, coord, sig_level = 0.05, force_maf = FALSE){ }else{ res_subset <- dplyr::select(result, gene, snp, pvalues, MAF, QTL_chr, QTL_pos) } - print(head(res_subset) ) + #print(head(res_subset) ) print("passed this point") #dplyr::filter( pos >= coord_split$start & pos <= coord_split$end) - + + if( !is.null(targets) ){ + res_subset <- dplyr::filter(res_subset, gene == targets) + print(paste0(length(unique(res_subset$gene)), " target features remain" ) ) + if( nrow(res_subset) == 0 ){ + return(NULL) + } + } # split by gene, convert to list object res_split <- split(res_subset, res_subset$gene) %>% @@ -505,7 +511,7 @@ getQTLinfo <- function(q, hit_info){ ## COLOC results - this should include the GWAS locus, the gene of interest, the top QTL variant and the top QTL p-value ## object - this should be a table combining the two input datasets, inner joined on "snp", to be used for plotting. ## sig.level now filters out QTLs with P > sig.level -runCOLOC <- function(gwas, qtl, qtl2 = NULL, hit, sig.level = NULL){ +runCOLOC <- function(gwas, qtl, qtl2 = NULL, hit, sig.level = NULL, target.file = NULL){ message(" * Analysing LOCUS: ", hit$locus ) # hit is a dataframe containing "snp", "chr", "pos", "locus" @@ -513,8 +519,10 @@ runCOLOC <- function(gwas, qtl, qtl2 = NULL, hit, sig.level = NULL){ hit_range <- joinCoords(hit, flank = 1e6) # extract GWAS and QTL summary for given coord - message(" * extracting GWAS") - g <- extractGWAS(gwas, hit_range) + if( is.null(qtl2) ){ + message(" * extracting GWAS") + g <- extractGWAS(gwas, hit_range) + } # get hit info from GWAS hit_snp <- hit$snp @@ -534,8 +542,8 @@ runCOLOC <- function(gwas, qtl, qtl2 = NULL, hit, sig.level = NULL){ qtl_range <- joinCoords(qtl_coord, flank = 1e6) message(" * extracting QTLs") - q <- extractQTL(qtl, qtl_range) - print(names(q[[1]])) + q <- extractQTL(qtl, qtl_range, targets = target.file[1]) + #print(names(q[[1]])) if( is.null(q) ){ return(NULL) } qtl_info <- getQTLinfo(q, hit_info) # if sig filtering requested @@ -547,7 +555,7 @@ runCOLOC <- function(gwas, qtl, qtl2 = NULL, hit, sig.level = NULL){ if( !is.null(qtl2) ){ # if second QTL dataset requested - q2 <- extractQTL(qtl2, qtl_range) + q2 <- extractQTL(qtl2, qtl_range, targets = target.file[2]) if( is.null(q2) ){ return(NULL) } qtl2_info <- getQTLinfo(q2, hit_info) if( !is.null(sig.level ) ){ @@ -596,7 +604,9 @@ runCOLOC <- function(gwas, qtl, qtl2 = NULL, hit, sig.level = NULL){ q_1 = q[[ qtl_combos[.x, "Var1"] ]] q_2 = q2[[ qtl_combos[.x, "Var2"] ]] message( " * ", .x, ": ", unique(q_1$gene), " vs ", unique(q_2$gene) ) - if( length(intersect(q_1$snp, q_2$snp) ) == 0 ){ + #message( head(q_1) ) + #message( head(q_2) ) + if( length(intersect(q_1$snp, q_2$snp) ) == 0 ){ message("Hold up! GWAS and QTL have no SNPs in common") return(NULL) } @@ -656,10 +666,12 @@ calc_LD <- function( coloc_res ){ option_list <- list( make_option(c('-o', '--outFolder'), help='the path to the output file', default = ""), - make_option(c('--gwas', '-g'), help= "the dataset ID for a GWAS in the GWAS/QTL database" ), + make_option(c('--gwas', '-g'), help= "the dataset ID for a GWAS in the GWAS/QTL database", default = NULL ), make_option(c('--qtl', '-q'), help = "the dataset ID for a QTL dataset in the GWAS/QTL database"), make_option(c('--qtl2', '-r'), help = "a second QTL dataset - using this will trigger QTL-QTL COLOC", default = NULL), make_option(c('--sig', '-s'), help = "the minimum p-value a QTL should have for testing", default = NULL), + make_option(c('--loci', '-l'), help = "text file of loci names", default = NULL), + make_option(c('--targets', '-t'), help = "TSV file of features, loci and GWAS for targeted analysis", default = NULL), make_option(c('--debug'), help = "load all files and then save RData without running COLOC", action = "store_true", default = FALSE) ) @@ -673,55 +685,81 @@ qtl_dataset <- opt$qtl qtl_dataset_2 <- opt$qtl2 debug <- opt$debug sig <- opt$sig +loci <- opt$loci +targets <- opt$targets + #maf_1000gp1 <- "/sc/arion/projects/ad-omics/data/references/1KGP1/1000G_EUR_MAF.bed.gz" maf_1000gp3 <- "/sc/arion/projects/ad-omics/data/references/1KGPp3v5/EUR_MAF/EUR.all.phase3_MAF.bed.gz" -main <- function(){ - #gwas_dataset <- "Marioni_2018" - #qtl_dataset <- "Microglia_THA" - - gwas <- pullData(gwas_dataset, "GWAS") - qtl <- pullData(qtl_dataset, "QTL") - - if( !is.null(qtl_dataset_2) ){ - qtl2 <- pullData(qtl_dataset_2, "QTL") - } - - top_loci <- extractLoci(gwas) - - - # load in MAF table - if( !exists("maf_1000g")){ +# load in MAF table +if( !exists("maf_1000g")){ - maf_1000g <- loadMAF(maf_1000gp3) - # load in liftover chain - chain_hg19_hg38 <- import.chain("/sc/arion/projects/ad-omics/data/references/liftOver/hg19ToHg38.over.chain") - } + maf_1000g <- loadMAF(maf_1000gp3) + # load in liftover chain + chain_hg19_hg38 <- import.chain("/sc/arion/projects/ad-omics/data/references/liftOver/hg19ToHg38.over.chain") +} +main <- function(){ if( debug == TRUE){ save.image("debug.RData") } - - all_coloc <- purrr::map( + # regular GWAS-QTL COLOC + if( is.null(targets) ){ + gwas <- pullData(gwas_dataset, "GWAS") + qtl <- pullData(qtl_dataset, "QTL") + + if( !is.null(qtl_dataset_2) ){ + qtl2 <- pullData(qtl_dataset_2, "QTL") + }else{ qtl2 <- NULL} + top_loci <- extractLoci(gwas) + print(paste0(" testing ", nrow(top_loci), " loci ") ) + + all_coloc <- purrr::map( 1:nrow(top_loci), ~{ res <- runCOLOC(gwas, qtl, qtl2, hit = top_loci[.x,], sig.level = sig) if(is.null(res) ){return(NULL) } return(res) + }) + } + # QTL-QTL COLOC with pairwise targets + if(!is.null(targets) ){ + targets <- read_tsv(targets) + print( paste0(" * testing ", nrow(targets), " pairs of QTLs" ) ) + # for testing + #targets <- head(targets, 3) + all_coloc <- purrr::map( + 1:nrow(targets), ~{ + print( paste0(" * pair ", .x , " of ", nrow(targets) ) ) + pair_df <- targets[.x,] + gwas <- pullData(pair_df$GWAS, "GWAS") + locus <- extractLoci(gwas) %>% filter(locus == pair_df$locus) + qtl1 <- pullData(pair_df$qtl_1, "QTL") + qtl2 <- pullData(pair_df$qtl_2, "QTL") + features <- c(pair_df$feature_1, pair_df$feature_2) + res <- runCOLOC(gwas, qtl1, qtl2, hit = locus, sig.level = sig, target.file = features) + if(is.null(res) ){return(NULL) } + return(res) } - ) - all_res <- map_df(all_coloc, "full_res") - all_obj <- map(all_coloc, "coloc_object") - names(all_obj) <- top_loci$locus - all_res <- arrange(all_res, desc(PP.H4.abf) ) - - outFile <- paste0(outFolder, qtl_dataset, "_", gwas_dataset, "_COLOC.tsv") - if( !is.null(qtl_dataset_2)){ - outFile <- paste0(outFolder, qtl_dataset, "_", qtl_dataset_2, "_", gwas_dataset, "_COLOC.tsv") - } + ) + } + all_res <- purrr::map_df(all_coloc, "full_res") + all_obj <- purrr::map(all_coloc, "coloc_object") + if(!is.null(targets) ){ + outFile <- outFolder + #outFile <- paste0(outFolder, unique(targets$locus), "_all_pairwise_QTL_COLOC.tsv") + all_res <- dplyr::bind_cols(targets, all_res %>% dplyr::select(-locus) ) + }else{ + outFile <- paste0(outFolder, qtl_dataset, "_", gwas_dataset, "_COLOC.tsv") + names(all_obj) <- top_loci$locus + } + all_res <- dplyr::arrange(all_res, desc(PP.H4.abf) ) + readr::write_tsv(all_res, path = outFile) # save COLOC objects for plotting - save(all_obj, file = gsub("tsv", "RData", outFile)) + outData <- gsub("tsv.gz", "RData", outFile) + outData <- gsub("tsv", "RData", outFile) + save(all_obj, file = outData) } main() From f4519b4b7b1a84b6079643625e5ff34e6e11b2f3 Mon Sep 17 00:00:00 2001 From: Jack Humphrey Date: Wed, 3 Apr 2024 12:32:17 -0400 Subject: [PATCH 3/7] remove duplicate snp-genes caused by multi-allellic SNPs --- COLOC/scripts/run_COLOC.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/COLOC/scripts/run_COLOC.R b/COLOC/scripts/run_COLOC.R index 553c8c0..9dfcf96 100644 --- a/COLOC/scripts/run_COLOC.R +++ b/COLOC/scripts/run_COLOC.R @@ -438,6 +438,8 @@ extractQTL <- function(qtl, coord, sig_level = 0.05, force_maf = FALSE, targets }else{ res_subset <- dplyr::select(result, gene, snp, pvalues, MAF, QTL_chr, QTL_pos) } + # edge case - multiallelic SNPs lead to two entries per SNP-gene - remove + res_subset <- dplyr::distinct(res_subset) #print(head(res_subset) ) print("passed this point") #dplyr::filter( pos >= coord_split$start & pos <= coord_split$end) From 62d7a25eddef969bc9ffb56667147bbe7c549951 Mon Sep 17 00:00:00 2001 From: Jack Humphrey Date: Thu, 11 Apr 2024 18:39:54 -0400 Subject: [PATCH 4/7] updates; still working on pairwise QTL coloc --- COLOC/pairwise_qtl_coloc.smk | 78 ++++++++++++++++++++++++++ COLOC/scripts/run_COLOC.R | 104 +++++++++++++++++++++-------------- 2 files changed, 142 insertions(+), 40 deletions(-) create mode 100644 COLOC/pairwise_qtl_coloc.smk diff --git a/COLOC/pairwise_qtl_coloc.smk b/COLOC/pairwise_qtl_coloc.smk new file mode 100644 index 0000000..6fc7eac --- /dev/null +++ b/COLOC/pairwise_qtl_coloc.smk @@ -0,0 +1,78 @@ +import sys +import pandas as pd +# Pairwise QTL COLOC pipeline + +outFolder = config["outFolder"] +inFolder = config["inFolder"] + +target_files = [ x for x in glob.glob(inFolder + "/*genewide_QTL_pairs*tsv.gz" )] + + + +shell.prefix("export PS1=""; ml anaconda3; CONDA_BASE=$(conda info --base); source $CONDA_BASE/etc/profile.d/conda.sh; ml purge; conda activate snakemake;") +rule all: + input: + #expand(outFolder + "{GWAS}/{QTL}/{QTL}_{GWAS}_coloc_results.snp.1kgp3_ld.tsv.gz", GWAS = gwas_data, QTL = qtl_data) + #outFolder + "all_COLOC_results_merged_H4_0_no_LD.tsv.gz", + #outFolder + "all_COLOC_results_merged_H4_0.5_with_LD.tsv.gz", + #outFolder + "pairwise_QTL_results.tsv.gz" + #expand( outFolder + "{GWAS}/{QTL}_{GWAS}_COLOC.tsv", GWAS = gwas_data, QTL = qtl_data ) + #outFolder + "all_COLOC_summary_results.tsv.gz" + +rule run_COLOC: + output: + outFolder + "{GWAS}/{QTL}_{GWAS}_COLOC.tsv", + outFolder + "{GWAS}/{QTL}_{GWAS}_COLOC.RData" + params: + script = "scripts/run_COLOC.R" + shell: + "mkdir -p {outFolder}{wildcards.GWAS};" + "ml R/4.0.3;" #ml arrow;" + "Rscript {params.script} -g {wildcards.GWAS} -q {wildcards.QTL} -o {outFolder}{wildcards.GWAS}/ " + +rule merge_COLOC_snp_level: + input: + expand( outFolder + "{GWAS}/{QTL}_{GWAS}_COLOC.tsv", GWAS = gwas_data, QTL = qtl_data ) + output: + outFolder + "all_COLOC_results_merged_H4_0_no_LD.tsv.gz", + outFolder + "all_COLOC_results_merged_H4_0.5_with_LD.tsv.gz" + params: + script = "scripts/merge_COLOC_results.R" + shell: + "ml R/3.6.0;" # LDLINKR only installed on 3.6 + "Rscript {params.script} --threshold 0 --inFolder {outFolder} --geneMeta {geneMeta};" + "Rscript {params.script} --threshold 0.5 --ld --inFolder {outFolder} --geneMeta {geneMeta} " + +rule create_LD_tables: + input: + outFolder + "{GWAS}/{QTL}_{GWAS}_COLOC.RData" + output: + outFolder + "{GWAS}/{QTL}/{QTL}_{GWAS}_coloc_results.snp.1kgp3_ld.tsv.gz" + params: + script = "scripts/prepare_gwas_qtls.R", + out = outFolder + "{GWAS}/{QTL}/" + shell: + "conda activate echoR;" + "Rscript {params.script} -g {wildcards.GWAS} -q {wildcards.QTL} --inFile {input} --outFolder {params.out}" + +rule prepare_QTL_COLOC: + input: + outFolder + "all_COLOC_results_merged_H4_0.5_with_LD.tsv.gz" + output: + outFolder + "pairwise_QTL_COLOC_targets.tsv.gz" + params: + script = "scripts/prepare_pairwise_qtl_coloc.R" + shell: + "ml R/4.2.0;" + "Rscript {params.script} -i {input} -o {output} -m across -p 0.8" + +rule run_QTL_COLOC: + input: + outFolder + "pairwise_QTL_COLOC_targets.tsv.gz" + output: + outFolder + "pairwise_QTL_results.tsv.gz" + params: + script = "scripts/run_COLOC.R" + shell: + "ml R/4.2.0;" + "Rscript {params.script} -t {input} -o {output}" diff --git a/COLOC/scripts/run_COLOC.R b/COLOC/scripts/run_COLOC.R index 9dfcf96..6b7c217 100644 --- a/COLOC/scripts/run_COLOC.R +++ b/COLOC/scripts/run_COLOC.R @@ -26,6 +26,7 @@ suppressPackageStartupMessages(library(optparse)) suppressPackageStartupMessages(library(GenomicRanges)) suppressPackageStartupMessages(library(rtracklayer)) +options(future.globals.maxSize = 8000 * 1024^2) #library(arrow) @@ -133,7 +134,7 @@ extractLoci <- function(gwas){ # already filtered from ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20100804/supporting/EUR.2of4intersection_allele_freq.20100804.sites.vcf.gz # all sites with an RSID and 2+ alleles loadMAF <- function(path){ - message(" * loading in MAF from 1000 Genomes") + message(Sys.time(), " * loading in MAF from 1000 Genomes") maf <- data.table::fread(path, nThread = 4) names(maf) <- c("chr", "pos", "snp", "MAF") maf$chr <- as.character(maf$chr) @@ -513,7 +514,8 @@ getQTLinfo <- function(q, hit_info){ ## COLOC results - this should include the GWAS locus, the gene of interest, the top QTL variant and the top QTL p-value ## object - this should be a table combining the two input datasets, inner joined on "snp", to be used for plotting. ## sig.level now filters out QTLs with P > sig.level -runCOLOC <- function(gwas, qtl, qtl2 = NULL, hit, sig.level = NULL, target.file = NULL){ +## bare - if TRUE then only return COLOC summary, not full object (saves memory) +runCOLOC <- function(gwas, qtl, qtl2 = NULL, hit, sig.level = NULL, target.file = NULL, bare = FALSE){ message(" * Analysing LOCUS: ", hit$locus ) # hit is a dataframe containing "snp", "chr", "pos", "locus" @@ -529,20 +531,24 @@ runCOLOC <- function(gwas, qtl, qtl2 = NULL, hit, sig.level = NULL, target.file hit_snp <- hit$snp # problem when no p column in hit! - - hit_info <- hit %>% - select(locus, GWAS_SNP = snp, GWAS_P = p) - # if GWAS is hg19 then lift over to hg38 - if(gwas$build != qtl$build){ - qtl_coord <- liftOverCoord(hit_coord, from = gwas$build, to = qtl$build) + if(is.null(gwas) ){ + hit_info <- hit %>% + select(locus) %>% mutate(GWAS_SNP = NA, GWAS_P = NA) + qtl_range <- hit_range }else{ - qtl_coord <- hit_coord + hit_info <- hit %>% + select(locus, GWAS_SNP = snp, GWAS_P = p) + # if GWAS is hg19 then lift over to hg38 + if(gwas$build != qtl$build){ + qtl_coord <- liftOverCoord(hit_coord, from = gwas$build, to = qtl$build) + }else{ + qtl_coord <- hit_coord + } + # record the position of the GWAS top SNP + hit_info$GWAS_chr <- splitCoords(qtl_coord)$chr + hit_info$GWAS_pos <- splitCoords(qtl_coord)$end + qtl_range <- joinCoords(qtl_coord, flank = 1e6) } - # record the position of the GWAS top SNP - hit_info$GWAS_chr <- splitCoords(qtl_coord)$chr - hit_info$GWAS_pos <- splitCoords(qtl_coord)$end - qtl_range <- joinCoords(qtl_coord, flank = 1e6) - message(" * extracting QTLs") q <- extractQTL(qtl, qtl_range, targets = target.file[1]) #print(names(q[[1]])) @@ -606,8 +612,6 @@ runCOLOC <- function(gwas, qtl, qtl2 = NULL, hit, sig.level = NULL, target.file q_1 = q[[ qtl_combos[.x, "Var1"] ]] q_2 = q2[[ qtl_combos[.x, "Var2"] ]] message( " * ", .x, ": ", unique(q_1$gene), " vs ", unique(q_2$gene) ) - #message( head(q_1) ) - #message( head(q_2) ) if( length(intersect(q_1$snp, q_2$snp) ) == 0 ){ message("Hold up! GWAS and QTL have no SNPs in common") return(NULL) @@ -631,10 +635,11 @@ runCOLOC <- function(gwas, qtl, qtl2 = NULL, hit, sig.level = NULL, target.file full_res <- full_join(hit_info, full_res, by = "locus") # remove rows with no coloc run full_res <- filter(full_res, !is.na(PP.H4.abf) ) - # add pairwise LD between GWAS and QTL SNP - #full_res <- calc_LD(full_res) - - return(list(full_res = full_res, coloc_object = coloc_res) ) + if(bare == TRUE){ + return(list(full_res = full_res) ) + }else{ + return(list(full_res = full_res, coloc_object = coloc_res) ) + } } # for each COLOC result per locus @@ -674,12 +679,14 @@ option_list <- list( make_option(c('--sig', '-s'), help = "the minimum p-value a QTL should have for testing", default = NULL), make_option(c('--loci', '-l'), help = "text file of loci names", default = NULL), make_option(c('--targets', '-t'), help = "TSV file of features, loci and GWAS for targeted analysis", default = NULL), - make_option(c('--debug'), help = "load all files and then save RData without running COLOC", action = "store_true", default = FALSE) + make_option(c('--debug'), help = "load all files and then save RData without running COLOC", action = "store_true", default = FALSE), + make_option(c('--lowmem', '-b'), help = "do not save COLOC objects, just the summaries", action = "store_true", default = FALSE), + make_option(c('--threads', '-c'), help = "how many threads to use", default = 1) ) option.parser <- OptionParser(option_list=option_list) opt <- parse_args(option.parser) - +print(opt) outFolder <- opt$outFolder gwas_dataset <- opt$gwas @@ -688,9 +695,9 @@ qtl_dataset_2 <- opt$qtl2 debug <- opt$debug sig <- opt$sig loci <- opt$loci -targets <- opt$targets - - +target_file <- opt$targets +low_mem <- opt$lowmem +threads <- opt$threads #maf_1000gp1 <- "/sc/arion/projects/ad-omics/data/references/1KGP1/1000G_EUR_MAF.bed.gz" maf_1000gp3 <- "/sc/arion/projects/ad-omics/data/references/1KGPp3v5/EUR_MAF/EUR.all.phase3_MAF.bed.gz" @@ -704,9 +711,10 @@ if( !exists("maf_1000g")){ } main <- function(){ + message(Sys.time()," * starting COLOC pipeline") if( debug == TRUE){ save.image("debug.RData") } # regular GWAS-QTL COLOC - if( is.null(targets) ){ + if( is.null(target_file) ){ gwas <- pullData(gwas_dataset, "GWAS") qtl <- pullData(qtl_dataset, "QTL") @@ -724,31 +732,40 @@ main <- function(){ }) } # QTL-QTL COLOC with pairwise targets - if(!is.null(targets) ){ - targets <- read_tsv(targets) + if(!is.null(target_file) ){ + targets <- read_tsv(target_file) print( paste0(" * testing ", nrow(targets), " pairs of QTLs" ) ) # for testing - #targets <- head(targets, 3) - all_coloc <- purrr::map( + targets <- head(targets, 100) + if(threads > 1){ + suppressPackageStartupMessages(library(furrr)) + plan(multisession, workers = threads) + map <- future_map + } + all_coloc <- map( 1:nrow(targets), ~{ print( paste0(" * pair ", .x , " of ", nrow(targets) ) ) pair_df <- targets[.x,] - gwas <- pullData(pair_df$GWAS, "GWAS") - locus <- extractLoci(gwas) %>% filter(locus == pair_df$locus) + if( "GWAS" %in% names(pair_df) ){ + gwas <- pullData(pair_df$GWAS, "GWAS") + locus <- extractLoci(gwas) %>% filter(locus == pair_df$locus) + }else{ + locus <- pair_df + gwas <- NULL + } qtl1 <- pullData(pair_df$qtl_1, "QTL") qtl2 <- pullData(pair_df$qtl_2, "QTL") features <- c(pair_df$feature_1, pair_df$feature_2) - res <- runCOLOC(gwas, qtl1, qtl2, hit = locus, sig.level = sig, target.file = features) + res <- runCOLOC(gwas, qtl1, qtl2, hit = locus, sig.level = sig, target.file = features, bare = low_mem) if(is.null(res) ){return(NULL) } return(res) } ) } - + all_res <- purrr::map_df(all_coloc, "full_res") - all_obj <- purrr::map(all_coloc, "coloc_object") - if(!is.null(targets) ){ - outFile <- outFolder + if(!is.null(target_file) ){ + outFile <- gsub(".tsv.gz", "_COLOC.tsv.gz", target_file) #outFile <- paste0(outFolder, unique(targets$locus), "_all_pairwise_QTL_COLOC.tsv") all_res <- dplyr::bind_cols(targets, all_res %>% dplyr::select(-locus) ) }else{ @@ -756,12 +773,19 @@ main <- function(){ names(all_obj) <- top_loci$locus } all_res <- dplyr::arrange(all_res, desc(PP.H4.abf) ) + message(Sys.time()," * finished COLOC pipeline") readr::write_tsv(all_res, path = outFile) # save COLOC objects for plotting - outData <- gsub("tsv.gz", "RData", outFile) - outData <- gsub("tsv", "RData", outFile) - save(all_obj, file = outData) + if( !low_mem ){ + all_obj <- purrr::map(all_coloc, "coloc_object") + if( is.null(targets) ){ + names(all_obj) <- top_loci$locus + } + outData <- gsub("tsv.gz", "RData", outFile) + outData <- gsub("tsv", "RData", outFile) + save(all_obj, file = outData) + } } main() From 9a29744e3dbd4d89f97c59be94847281520126f5 Mon Sep 17 00:00:00 2001 From: Jack Humphrey Date: Thu, 5 Sep 2024 16:03:07 -0400 Subject: [PATCH 5/7] fixed bug on outfolder --- COLOC/cluster.yaml | 5 +- COLOC/pairwise_qtl_coloc.smk | 74 ++++---------------- COLOC/scripts/run_COLOC.R | 128 +++++++++++++++++++++-------------- GWAS/processGWAS.R | 4 +- 4 files changed, 97 insertions(+), 114 deletions(-) diff --git a/COLOC/cluster.yaml b/COLOC/cluster.yaml index 337150f..e5eddc7 100644 --- a/COLOC/cluster.yaml +++ b/COLOC/cluster.yaml @@ -1,5 +1,4 @@ __default__: - #partition: chimera queue: premium cores: 1 mem: 24000 @@ -8,3 +7,7 @@ __default__: output: logs/{rule}:{wildcards}.stdout error: logs/{rule}:{wildcards}.stderr himem: "" +pairwise_QTL_COLOC: + cores: 4 + mem: 8000 + time: 8640 diff --git a/COLOC/pairwise_qtl_coloc.smk b/COLOC/pairwise_qtl_coloc.smk index 6fc7eac..478e166 100644 --- a/COLOC/pairwise_qtl_coloc.smk +++ b/COLOC/pairwise_qtl_coloc.smk @@ -1,78 +1,30 @@ import sys import pandas as pd -# Pairwise QTL COLOC pipeline +import glob +import re +import os +# Pairwise QTL COLOC pipeline +# Jack Humphrey outFolder = config["outFolder"] inFolder = config["inFolder"] -target_files = [ x for x in glob.glob(inFolder + "/*genewide_QTL_pairs*tsv.gz" )] - +target_files = [ x for x in glob.glob(inFolder + "*genewide_QTL_pairs*tsv.gz" )] +target_ids = [os.path.basename(re.sub('.tsv.gz', '', x, count=1)) for x in target_files ] shell.prefix("export PS1=""; ml anaconda3; CONDA_BASE=$(conda info --base); source $CONDA_BASE/etc/profile.d/conda.sh; ml purge; conda activate snakemake;") rule all: input: - #expand(outFolder + "{GWAS}/{QTL}/{QTL}_{GWAS}_coloc_results.snp.1kgp3_ld.tsv.gz", GWAS = gwas_data, QTL = qtl_data) - #outFolder + "all_COLOC_results_merged_H4_0_no_LD.tsv.gz", - #outFolder + "all_COLOC_results_merged_H4_0.5_with_LD.tsv.gz", - #outFolder + "pairwise_QTL_results.tsv.gz" - #expand( outFolder + "{GWAS}/{QTL}_{GWAS}_COLOC.tsv", GWAS = gwas_data, QTL = qtl_data ) - #outFolder + "all_COLOC_summary_results.tsv.gz" - -rule run_COLOC: - output: - outFolder + "{GWAS}/{QTL}_{GWAS}_COLOC.tsv", - outFolder + "{GWAS}/{QTL}_{GWAS}_COLOC.RData" - params: - script = "scripts/run_COLOC.R" - shell: - "mkdir -p {outFolder}{wildcards.GWAS};" - "ml R/4.0.3;" #ml arrow;" - "Rscript {params.script} -g {wildcards.GWAS} -q {wildcards.QTL} -o {outFolder}{wildcards.GWAS}/ " - -rule merge_COLOC_snp_level: - input: - expand( outFolder + "{GWAS}/{QTL}_{GWAS}_COLOC.tsv", GWAS = gwas_data, QTL = qtl_data ) - output: - outFolder + "all_COLOC_results_merged_H4_0_no_LD.tsv.gz", - outFolder + "all_COLOC_results_merged_H4_0.5_with_LD.tsv.gz" - params: - script = "scripts/merge_COLOC_results.R" - shell: - "ml R/3.6.0;" # LDLINKR only installed on 3.6 - "Rscript {params.script} --threshold 0 --inFolder {outFolder} --geneMeta {geneMeta};" - "Rscript {params.script} --threshold 0.5 --ld --inFolder {outFolder} --geneMeta {geneMeta} " - -rule create_LD_tables: - input: - outFolder + "{GWAS}/{QTL}_{GWAS}_COLOC.RData" - output: - outFolder + "{GWAS}/{QTL}/{QTL}_{GWAS}_coloc_results.snp.1kgp3_ld.tsv.gz" - params: - script = "scripts/prepare_gwas_qtls.R", - out = outFolder + "{GWAS}/{QTL}/" - shell: - "conda activate echoR;" - "Rscript {params.script} -g {wildcards.GWAS} -q {wildcards.QTL} --inFile {input} --outFolder {params.out}" - -rule prepare_QTL_COLOC: - input: - outFolder + "all_COLOC_results_merged_H4_0.5_with_LD.tsv.gz" - output: - outFolder + "pairwise_QTL_COLOC_targets.tsv.gz" - params: - script = "scripts/prepare_pairwise_qtl_coloc.R" - shell: - "ml R/4.2.0;" - "Rscript {params.script} -i {input} -o {output} -m across -p 0.8" + expand( outFolder + "{target}_COLOC.tsv.gz", target = target_ids) -rule run_QTL_COLOC: +rule pairwise_QTL_COLOC: input: - outFolder + "pairwise_QTL_COLOC_targets.tsv.gz" + inFolder + "{target}.tsv.gz" output: - outFolder + "pairwise_QTL_results.tsv.gz" + outFolder + "{target}_COLOC.tsv.gz" params: script = "scripts/run_COLOC.R" shell: - "ml R/4.2.0;" - "Rscript {params.script} -t {input} -o {output}" + "ml R;" + "Rscript {params.script} --targets {input} -o {outFolder} --lowmem --threads 1" diff --git a/COLOC/scripts/run_COLOC.R b/COLOC/scripts/run_COLOC.R index 6b7c217..4755024 100644 --- a/COLOC/scripts/run_COLOC.R +++ b/COLOC/scripts/run_COLOC.R @@ -30,6 +30,20 @@ options(future.globals.maxSize = 8000 * 1024^2) #library(arrow) +message_ <- function(text){ + message(Sys.time(), " * ", text) +} +## verbose messaging +message_verbose <- function(input){ + if(verbose){ + if( "data.frame" %in% class(input)){ + print(input) + } + if( class(input) == "character"){ + message(Sys.time(), " * ", input) + } + } +} # flank coordinates by set number of bases (default 1MB) # work on either a coordinate string or a dataframe containing chr start and end columns joinCoords <- function(input, flank = 0){ @@ -45,10 +59,15 @@ joinCoords <- function(input, flank = 0){ stopifnot( all(c("chr", "start", "end") %in% names(input) ) | all(c("chr","pos") %in% names(input) ) ) stopifnot( flank >= 0) if( all( c("chr","start","end") %in% names(input) ) ){ - coords <- paste0(input$chr, ":", input$start - flank, "-",input$end + flank) + # deal with flanks that go negative + starts <- input$start - flank + starts[starts < 0 ] <- 1 + coords <- paste0(input$chr, ":", starts, "-",input$end + flank) } if( all( c("chr", "pos") %in% names(input) ) ){ - coords <- paste0(input$chr, ":", input$pos - (flank + 1), "-", input$pos + flank) + starts <- input$pos - (flank + 1) + starts[starts < 0 ] <- 1 + coords <- paste0(input$chr, ":", starts, "-", input$pos + flank) } return(coords) } @@ -63,10 +82,10 @@ splitCoords <- function(coords){ } pullData <- function(dataset, type = "GWAS"){ - message(Sys.time()," * selected dataset: ", dataset) + message_verbose(paste0("selected dataset: ", dataset)) db_path <- "/sc/arion/projects/ad-omics/data/references/GWAS/GWAS-QTL_data_dictionary.xlsx" - message(Sys.time()," * reading GWAS database from ", db_path) + message_verbose(paste0("reading GWAS database from ", db_path)) stopifnot( file.exists(db_path) ) gwas_db <- suppressMessages(readxl::read_excel(db_path, sheet = type, na= c("", "-","NA"))) @@ -134,7 +153,7 @@ extractLoci <- function(gwas){ # already filtered from ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20100804/supporting/EUR.2of4intersection_allele_freq.20100804.sites.vcf.gz # all sites with an RSID and 2+ alleles loadMAF <- function(path){ - message(Sys.time(), " * loading in MAF from 1000 Genomes") + message_("loading in MAF from 1000 Genomes") maf <- data.table::fread(path, nThread = 4) names(maf) <- c("chr", "pos", "snp", "MAF") maf$chr <- as.character(maf$chr) @@ -155,12 +174,12 @@ matchMAF <- function(data, maf){ # subset maf by chromosome maf <- maf[chr] # match on RSID - message(" * before joining MAF: ", nrow(data), " SNPs") + message_verbose(paste0("before joining MAF: ", nrow(data), " SNPs")) data$MAF <- maf$MAF[match(data$snp, maf$snp)] matched <- data[ !is.na(data$MAF) & data$MAF >0 & data$MAF < 1,] - message(" * after joining MAF: ", nrow(matched), " SNPs") + message_verbose(paste0("after joining MAF: ", nrow(matched), " SNPs")) if(nrow(matched) == 0 ){ - print(head(data) ) + message_verbose(head(data) ) stop("no SNPs match on RSID. Inspect data") } return(matched) @@ -201,10 +220,10 @@ extractGWAS <- function(gwas, coord, refFolder = "/sc/arion/projects/ad-omics/da } cmd <- paste( "ml bcftools; tabix -h ", gwas_path, coord ) - message(" * running command: ", cmd) + message_verbose(paste0("running command: ", cmd) ) result <- as.data.frame(data.table::fread(cmd = cmd, nThread = 4) ) - message(" * GWAS extracted!") + message_verbose("GWAS extracted!") stopifnot( nrow(result) > 0 ) @@ -225,11 +244,11 @@ extractGWAS <- function(gwas, coord, refFolder = "/sc/arion/projects/ad-omics/da # deal with MAF if missing if( is.na(mafCol) | force_maf == TRUE ){ - message(" * MAF not present - using 1000 Genomes MAF") + message_verbose("MAF not present - using 1000 Genomes MAF") # how to get this in to the function? it can't find maf_1000g result <- matchMAF(result, maf = maf_1000g) }else{ - message(" * using supplied MAF") + message_verbose("using supplied MAF") names(result)[names(col_dict) == mafCol] <- "MAF" } # convert standard error to the variance @@ -276,9 +295,9 @@ liftOverCoord <- function(coord_string, from = "hg19", to = "hg38"){ ) # lift over using rtracklayer # watch out for duplicate entries - message(" * lifting over....") + message_verbose("lifting over....") lifted_over <- rtracklayer::liftOver(coord_gr, chain ) - message(" * lifted!" ) + message_verbose("lifted!" ) #return(lifted_over) stopifnot(length(lifted_over) == length(coord_gr) ) @@ -308,7 +327,7 @@ extractQTL_parquet <- function(qtl, coord, sig_level = 0.05){ perm_res <- dplyr::bind_cols(perm_res, splitCoords(perm_res$variant_id) ) sig <- dplyr::filter(perm_res, qval < sig_level & chr == coord_split$chr & start >= coord_split$start & start <= coord_split$end ) - message( paste0( nrow(sig), " significant genes or splicing events at this locus" ) ) + message_verbose( paste0( nrow(sig), " significant genes or splicing events at this locus" ) ) if( nrow(sig) == 0 ){ return(NULL) } # read in nominal QTL associations @@ -333,14 +352,14 @@ extractQTL_tabix <- function(qtl, coord){ } stopifnot( file.exists(qtl$full_path) ) cmd <- paste( "ml bcftools; tabix -h ", qtl$full_path, coord ) - message(" * running command: ", cmd) + message_verbose(paste0("running command: ", cmd)) result <- as.data.frame(data.table::fread(cmd = cmd, nThread = 4) ) # deal with regions of no QTL association if( nrow(result) == 0){ return(NULL) } #stopifnot( nrow(result) > 0 ) - message(" * QTL extracted!") + message_verbose("QTL extracted!") return(result) } @@ -406,13 +425,13 @@ extractQTL <- function(qtl, coord, sig_level = 0.05, force_maf = FALSE, targets # this requires "chr" to be present in QTL data if( debug == TRUE){ result$MAF <- NA }else{ if( is.na(mafCol) | force_maf == TRUE ){ - message(" * MAF not present - using 1000 Genomes MAF") + message_verbose("MAF not present - using 1000 Genomes MAF") #stopifnot( "chr" %in% names(col_dict) ) names(result)[names(col_dict) == "chr"] <- "chr" result <- matchMAF(result, maf = maf_1000g) #print(head(result) ) }else{ - message(" * using supplied MAF") + message_verbose("using supplied MAF") names(result)[names(col_dict) == mafCol] <- "MAF" #print(col_dict) #print(head(result) ) @@ -420,7 +439,7 @@ extractQTL <- function(qtl, coord, sig_level = 0.05, force_maf = FALSE, targets } # deal with edge cases - 1 gene and the gene id is NA if( all( is.na(result$gene) ) ){ - message(" * no genes in QTL result") + message_verbose("no genes in QTL result") return(NULL) }else{ result <- dplyr::filter(result, !is.na(gene) ) @@ -442,12 +461,14 @@ extractQTL <- function(qtl, coord, sig_level = 0.05, force_maf = FALSE, targets # edge case - multiallelic SNPs lead to two entries per SNP-gene - remove res_subset <- dplyr::distinct(res_subset) #print(head(res_subset) ) - print("passed this point") + message_verbose("passed this point") + #save(list = ls(all.names = TRUE), file = "image.RData", envir = + #environment()) #dplyr::filter( pos >= coord_split$start & pos <= coord_split$end) if( !is.null(targets) ){ res_subset <- dplyr::filter(res_subset, gene == targets) - print(paste0(length(unique(res_subset$gene)), " target features remain" ) ) + message_verbose(paste0(length(unique(res_subset$gene)), " target features remain" ) ) if( nrow(res_subset) == 0 ){ return(NULL) } @@ -490,7 +511,7 @@ getQTLinfo <- function(q, hit_info){ } }) qtl_info <- bind_cols(qtl_info, gwas_info) - print(head(qtl_info) ) + message_verbose(head(qtl_info) ) }else{ qtl_info <- q %>% map_df( ~{ @@ -516,7 +537,7 @@ getQTLinfo <- function(q, hit_info){ ## sig.level now filters out QTLs with P > sig.level ## bare - if TRUE then only return COLOC summary, not full object (saves memory) runCOLOC <- function(gwas, qtl, qtl2 = NULL, hit, sig.level = NULL, target.file = NULL, bare = FALSE){ - message(" * Analysing LOCUS: ", hit$locus ) + message_(paste0("Analysing locus: ", hit$locus )) # hit is a dataframe containing "snp", "chr", "pos", "locus" hit_coord <- joinCoords(hit, flank = 0) @@ -524,7 +545,7 @@ runCOLOC <- function(gwas, qtl, qtl2 = NULL, hit, sig.level = NULL, target.file # extract GWAS and QTL summary for given coord if( is.null(qtl2) ){ - message(" * extracting GWAS") + message_verbose("extracting GWAS") g <- extractGWAS(gwas, hit_range) } # get hit info from GWAS @@ -549,7 +570,7 @@ runCOLOC <- function(gwas, qtl, qtl2 = NULL, hit, sig.level = NULL, target.file hit_info$GWAS_pos <- splitCoords(qtl_coord)$end qtl_range <- joinCoords(qtl_coord, flank = 1e6) } - message(" * extracting QTLs") + message_verbose("extracting QTLs") q <- extractQTL(qtl, qtl_range, targets = target.file[1]) #print(names(q[[1]])) if( is.null(q) ){ return(NULL) } @@ -558,7 +579,7 @@ runCOLOC <- function(gwas, qtl, qtl2 = NULL, hit, sig.level = NULL, target.file if( !is.null(sig.level ) ){ qtl_info <- filter(qtl_info, QTL_P < sig.level) q <- q[ qtl_info$gene ] - message(" * ", length(q), " features in QTL") + message_verbose(paste0(length(q), " features in QTL")) } if( !is.null(qtl2) ){ @@ -570,14 +591,14 @@ runCOLOC <- function(gwas, qtl, qtl2 = NULL, hit, sig.level = NULL, target.file qtl2_info <- filter(qtl2_info, QTL_P < sig.level) q2 <- q2[ qtl2_info$gene ] } - message(" * ", length(q2), " features in QTL 2") + message_verbose( paste0(length(q2), " features in QTL 2")) } # actually run COLOC - message(" * running COLOC") + message_("running COLOC") # return the results table and an object containing g and q # GWAS-QTL COLOC if( is.null(qtl2) ){ - message(" * GWAS-QTL COLOC") + message_("GWAS-QTL COLOC") coloc_res <- purrr::map( q, ~{ if( length(intersect(g$snp, .x$snp) ) == 0 ){ @@ -592,11 +613,11 @@ runCOLOC <- function(gwas, qtl, qtl2 = NULL, hit, sig.level = NULL, target.file return( list(df = coloc_df, object = coloc_object) ) }) }else{ - message(" * QTL-QTL COLOC") + message_("QTL-QTL COLOC") # remove features with lead SNPs > 1e-5 # test all pairs of QTL features in the locus qtl_combos <- expand.grid(1:length(q), 1:length(q2) ) - message(" * testing ", nrow(qtl_combos), " pairs of QTLs") + message_(paste0("testing ", nrow(qtl_combos), " pairs of QTLs")) # qtl_info is matched on to COLOC results using gene names - but for QTL-QTL there are two genes, so instead use the index of the pair qtl_info <- full_join( @@ -611,9 +632,9 @@ runCOLOC <- function(gwas, qtl, qtl2 = NULL, hit, sig.level = NULL, target.file purrr::map( 1:nrow(qtl_combos), ~{ q_1 = q[[ qtl_combos[.x, "Var1"] ]] q_2 = q2[[ qtl_combos[.x, "Var2"] ]] - message( " * ", .x, ": ", unique(q_1$gene), " vs ", unique(q_2$gene) ) + message_( paste0( .x, ": ", unique(q_1$gene), " vs ", unique(q_2$gene) ) ) if( length(intersect(q_1$snp, q_2$snp) ) == 0 ){ - message("Hold up! GWAS and QTL have no SNPs in common") + message_("Hold up! GWAS and QTL have no SNPs in common") return(NULL) } coloc_object <- coloc.abf(dataset1 = q_1, dataset2 = q_2) @@ -625,11 +646,11 @@ runCOLOC <- function(gwas, qtl, qtl2 = NULL, hit, sig.level = NULL, target.file } if( is.null(coloc_res) ){ return(NULL) } - message(" * COLOC finished") + message_("COLOC finished") coloc_df <- map_df(coloc_res, "df", .id = "gene") %>% dplyr::mutate(locus = hit$locus ) %>% dplyr::select(locus, gene, everything() ) - message(" * COLOC res created") + message_("COLOC res created") # bind coloc_res to other tables full_res <- full_join(qtl_info, coloc_df, by = "gene") full_res <- full_join(hit_info, full_res, by = "locus") @@ -649,7 +670,7 @@ calc_LD <- function( coloc_res ){ # get LDlink token from .Renviron (only Jack has this, for access go to https://ldlink.nci.nih.gov/?tab=apiaccess) token <- Sys.getenv("LDLINK_TOKEN") if( token == ""){ - warning(" * LDlink token not found!" ) + warning(Sys.time(), " * LDlink token not found!" ) return(NA) } snps <- unique( c( unique(coloc_res$GWAS_SNP), unique( coloc_res$QTL_SNP) ) ) @@ -681,12 +702,12 @@ option_list <- list( make_option(c('--targets', '-t'), help = "TSV file of features, loci and GWAS for targeted analysis", default = NULL), make_option(c('--debug'), help = "load all files and then save RData without running COLOC", action = "store_true", default = FALSE), make_option(c('--lowmem', '-b'), help = "do not save COLOC objects, just the summaries", action = "store_true", default = FALSE), - make_option(c('--threads', '-c'), help = "how many threads to use", default = 1) + make_option(c('--threads', '-c'), help = "how many threads to use", default = 1), + make_option(c('--verbose', '-v'), help = "if selected then output more information", action = "store_true", default = FALSE) ) option.parser <- OptionParser(option_list=option_list) opt <- parse_args(option.parser) -print(opt) outFolder <- opt$outFolder gwas_dataset <- opt$gwas @@ -698,9 +719,11 @@ loci <- opt$loci target_file <- opt$targets low_mem <- opt$lowmem threads <- opt$threads +verbose <- opt$verbose #maf_1000gp1 <- "/sc/arion/projects/ad-omics/data/references/1KGP1/1000G_EUR_MAF.bed.gz" maf_1000gp3 <- "/sc/arion/projects/ad-omics/data/references/1KGPp3v5/EUR_MAF/EUR.all.phase3_MAF.bed.gz" +if(verbose){print(opt)} # load in MAF table if( !exists("maf_1000g")){ @@ -711,10 +734,11 @@ if( !exists("maf_1000g")){ } main <- function(){ - message(Sys.time()," * starting COLOC pipeline") + message_("starting COLOC pipeline") if( debug == TRUE){ save.image("debug.RData") } # regular GWAS-QTL COLOC if( is.null(target_file) ){ + message_("GWAS - QTL COLOC") gwas <- pullData(gwas_dataset, "GWAS") qtl <- pullData(qtl_dataset, "QTL") @@ -722,7 +746,7 @@ main <- function(){ qtl2 <- pullData(qtl_dataset_2, "QTL") }else{ qtl2 <- NULL} top_loci <- extractLoci(gwas) - print(paste0(" testing ", nrow(top_loci), " loci ") ) + message_(paste0(" testing ", nrow(top_loci), " loci ") ) all_coloc <- purrr::map( 1:nrow(top_loci), ~{ @@ -733,10 +757,13 @@ main <- function(){ } # QTL-QTL COLOC with pairwise targets if(!is.null(target_file) ){ + message_("QTL-QTL COLOC") targets <- read_tsv(target_file) - print( paste0(" * testing ", nrow(targets), " pairs of QTLs" ) ) - # for testing - targets <- head(targets, 100) + # for TESTING + #targets <- head(targets, 3) + message_( paste0("testing ", nrow(targets), " pairs of QTLs" ) ) + # doesn't work properly - furrr loads the whole MAF object into memory for each parallel run, 100MB per run which is crazy. + # for parallel to work I would have to transition to querying the MAF database each time like we do with the sumstats. if(threads > 1){ suppressPackageStartupMessages(library(furrr)) plan(multisession, workers = threads) @@ -744,12 +771,13 @@ main <- function(){ } all_coloc <- map( 1:nrow(targets), ~{ - print( paste0(" * pair ", .x , " of ", nrow(targets) ) ) + message_verbose( paste0("Pair ", .x , " of ", nrow(targets) ) ) pair_df <- targets[.x,] if( "GWAS" %in% names(pair_df) ){ gwas <- pullData(pair_df$GWAS, "GWAS") locus <- extractLoci(gwas) %>% filter(locus == pair_df$locus) }else{ + message_verbose("no GWAS specified - using chr and pos from targets file") locus <- pair_df gwas <- NULL } @@ -757,6 +785,7 @@ main <- function(){ qtl2 <- pullData(pair_df$qtl_2, "QTL") features <- c(pair_df$feature_1, pair_df$feature_2) res <- runCOLOC(gwas, qtl1, qtl2, hit = locus, sig.level = sig, target.file = features, bare = low_mem) + gc() if(is.null(res) ){return(NULL) } return(res) } @@ -765,16 +794,15 @@ main <- function(){ all_res <- purrr::map_df(all_coloc, "full_res") if(!is.null(target_file) ){ - outFile <- gsub(".tsv.gz", "_COLOC.tsv.gz", target_file) - #outFile <- paste0(outFolder, unique(targets$locus), "_all_pairwise_QTL_COLOC.tsv") - all_res <- dplyr::bind_cols(targets, all_res %>% dplyr::select(-locus) ) + outFile <- file.path(outFolder, gsub(".tsv.gz", "_COLOC.tsv.gz", basename(target_file) ) ) + all_res <- dplyr::bind_cols(targets, all_res %>% select(-locus) ) }else{ outFile <- paste0(outFolder, qtl_dataset, "_", gwas_dataset, "_COLOC.tsv") names(all_obj) <- top_loci$locus } all_res <- dplyr::arrange(all_res, desc(PP.H4.abf) ) - message(Sys.time()," * finished COLOC pipeline") - + message("finished COLOC pipeline") + message_verbose(paste0("writing output to ", outFile)) readr::write_tsv(all_res, path = outFile) # save COLOC objects for plotting if( !low_mem ){ diff --git a/GWAS/processGWAS.R b/GWAS/processGWAS.R index 12a1835..97fc82a 100644 --- a/GWAS/processGWAS.R +++ b/GWAS/processGWAS.R @@ -81,7 +81,7 @@ get_standard_error <- function(OR, P){ # write out sorted file to reference folder # tabix index the file # Ripke has loci with chr23 for X - remove -sortGWAS <- function(gwas, out_folder = "./"){ +sortGWAS <- function(gwas, out_folder = ""){ dataset <- gwas$dataset col_chr <- gwas$full_chrom @@ -127,7 +127,7 @@ sortGWAS <- function(gwas, out_folder = "./"){ # remove duplicate rows gwas_sorted <- dplyr::distinct(gwas_sorted) - out_path <- paste0(out_folder, dataset, ".processed.tsv") + out_path <- file.path(out_folder, paste0(dataset, ".processed.tsv") ) message(Sys.time()," * writing sorted GWAS file to ", out_path) readr::write_tsv(gwas_sorted, out_path) stopifnot(file.exists(out_path) ) From 432626b41584d23eeb49e5427627caf96162b825 Mon Sep 17 00:00:00 2001 From: Jack Humphrey Date: Mon, 24 Feb 2025 15:42:46 -0500 Subject: [PATCH 6/7] updates - allow custom MAF, fix bugs --- COLOC/pairwise_qtl_coloc.smk | 5 +-- COLOC/scripts/run_COLOC.R | 69 +++++++++++++++++++++++++----------- 2 files changed, 51 insertions(+), 23 deletions(-) diff --git a/COLOC/pairwise_qtl_coloc.smk b/COLOC/pairwise_qtl_coloc.smk index 478e166..a79653f 100644 --- a/COLOC/pairwise_qtl_coloc.smk +++ b/COLOC/pairwise_qtl_coloc.smk @@ -8,8 +8,9 @@ import os # Jack Humphrey outFolder = config["outFolder"] inFolder = config["inFolder"] +MAF_file = config["MAF_file"] -target_files = [ x for x in glob.glob(inFolder + "*genewide_QTL_pairs*tsv.gz" )] +target_files = [ x for x in glob.glob(inFolder + "*genewide*pairs*tsv.gz" )] target_ids = [os.path.basename(re.sub('.tsv.gz', '', x, count=1)) for x in target_files ] @@ -27,4 +28,4 @@ rule pairwise_QTL_COLOC: script = "scripts/run_COLOC.R" shell: "ml R;" - "Rscript {params.script} --targets {input} -o {outFolder} --lowmem --threads 1" + "Rscript {params.script} --targets {input} -o {outFolder} --lowmem --threads 1 --MAF {MAF_file}" diff --git a/COLOC/scripts/run_COLOC.R b/COLOC/scripts/run_COLOC.R index 4755024..1904335 100644 --- a/COLOC/scripts/run_COLOC.R +++ b/COLOC/scripts/run_COLOC.R @@ -153,9 +153,9 @@ extractLoci <- function(gwas){ # already filtered from ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20100804/supporting/EUR.2of4intersection_allele_freq.20100804.sites.vcf.gz # all sites with an RSID and 2+ alleles loadMAF <- function(path){ - message_("loading in MAF from 1000 Genomes") + #message_("loading in MAF from 1000 Genomes") maf <- data.table::fread(path, nThread = 4) - names(maf) <- c("chr", "pos", "snp", "MAF") + names(maf)[1:4] <- c("chr", "pos", "snp", "MAF") maf$chr <- as.character(maf$chr) data.table::setkey(maf, "chr") maf$MAF <- suppressWarnings(as.numeric(maf$MAF)) @@ -179,8 +179,9 @@ matchMAF <- function(data, maf){ matched <- data[ !is.na(data$MAF) & data$MAF >0 & data$MAF < 1,] message_verbose(paste0("after joining MAF: ", nrow(matched), " SNPs")) if(nrow(matched) == 0 ){ - message_verbose(head(data) ) - stop("no SNPs match on RSID. Inspect data") + #message_verbose(head(data) ) + message_verbose("no SNPs match on RSID. Inspect data") + return(NULL) } return(matched) } @@ -219,7 +220,7 @@ extractGWAS <- function(gwas, coord, refFolder = "/sc/arion/projects/ad-omics/da } } - cmd <- paste( "ml bcftools; tabix -h ", gwas_path, coord ) + cmd <- paste( "ml bcftools/1.9; tabix -h ", gwas_path, coord ) message_verbose(paste0("running command: ", cmd) ) result <- as.data.frame(data.table::fread(cmd = cmd, nThread = 4) ) @@ -351,7 +352,7 @@ extractQTL_tabix <- function(qtl, coord){ } } stopifnot( file.exists(qtl$full_path) ) - cmd <- paste( "ml bcftools; tabix -h ", qtl$full_path, coord ) + cmd <- paste( "ml bcftools/1.9; tabix -h ", qtl$full_path, coord ) message_verbose(paste0("running command: ", cmd)) result <- as.data.frame(data.table::fread(cmd = cmd, nThread = 4) ) # deal with regions of no QTL association @@ -411,9 +412,11 @@ extractQTL <- function(qtl, coord, sig_level = 0.05, force_maf = FALSE, targets names(result)[names(col_dict) == seCol] <- "varbeta" # don't forget to square the standard error to get the variance result$varbeta <- result$varbeta^2 - + message_verbose(" remove associations with 0 variance") + message_verbose(paste0(" before: ", nrow(result), " associations")) # remove rows with variance of 0 - this will corrupt COLOC result <- dplyr::filter(result, varbeta != 0) + message_verbose(paste0(" after: ", nrow(result), " associations")) } names(result)[names(col_dict) == snpCol] <- "snp" # Young microglia use log10P - convert @@ -423,12 +426,13 @@ extractQTL <- function(qtl, coord, sig_level = 0.05, force_maf = FALSE, targets # deal with MAF - meta-analysis outputs won't have it # this requires "chr" to be present in QTL data - if( debug == TRUE){ result$MAF <- NA }else{ + #if( debug == TRUE){ result$MAF <- NA }else{ # why do we do this? breaks it when debug is TRUE if( is.na(mafCol) | force_maf == TRUE ){ message_verbose("MAF not present - using 1000 Genomes MAF") #stopifnot( "chr" %in% names(col_dict) ) names(result)[names(col_dict) == "chr"] <- "chr" result <- matchMAF(result, maf = maf_1000g) + if( all(is.na(result$MAF) ) | is.null(result) ){ return(NULL) } #print(head(result) ) }else{ message_verbose("using supplied MAF") @@ -436,7 +440,7 @@ extractQTL <- function(qtl, coord, sig_level = 0.05, force_maf = FALSE, targets #print(col_dict) #print(head(result) ) } - } + #} # deal with edge cases - 1 gene and the gene id is NA if( all( is.na(result$gene) ) ){ message_verbose("no genes in QTL result") @@ -458,8 +462,14 @@ extractQTL <- function(qtl, coord, sig_level = 0.05, force_maf = FALSE, targets }else{ res_subset <- dplyr::select(result, gene, snp, pvalues, MAF, QTL_chr, QTL_pos) } + # jan 2025: another edge case - NA pvalues + message_verbose("removing NA P-values") + res_subset <- filter(res_subset, !is.na(pvalues) ) + message_verbose(paste0("after: ", nrow(res_subset) )) # edge case - multiallelic SNPs lead to two entries per SNP-gene - remove + message_verbose("removing multi-allelic SNPs") res_subset <- dplyr::distinct(res_subset) + message_verbose(paste0("after: ", nrow(res_subset) )) #print(head(res_subset) ) message_verbose("passed this point") #save(list = ls(all.names = TRUE), file = "image.RData", envir = @@ -469,6 +479,7 @@ extractQTL <- function(qtl, coord, sig_level = 0.05, force_maf = FALSE, targets if( !is.null(targets) ){ res_subset <- dplyr::filter(res_subset, gene == targets) message_verbose(paste0(length(unique(res_subset$gene)), " target features remain" ) ) + message_verbose(paste0(nrow(res_subset), " associations remain")) if( nrow(res_subset) == 0 ){ return(NULL) } @@ -586,7 +597,12 @@ runCOLOC <- function(gwas, qtl, qtl2 = NULL, hit, sig.level = NULL, target.file # if second QTL dataset requested q2 <- extractQTL(qtl2, qtl_range, targets = target.file[2]) if( is.null(q2) ){ return(NULL) } - qtl2_info <- getQTLinfo(q2, hit_info) + # get QTL betas for lead SNP in QTL 1 + hit_info_qtl1 <- data.frame(locus = hit_info$locus, GWAS_SNP = qtl_info$QTL_SNP, GWAS_P = qtl_info$QTL_P) + qtl2_info <- getQTLinfo(q2, hit_info_qtl1) + # get QTL betas for lead SNP in QTL 2 + hit_info_qtl2 <- data.frame(locus = hit_info$locus, GWAS_SNP = qtl2_info$QTL_SNP, GWAS_P = qtl2_info$QTL_P) + qtl_info <- getQTLinfo(q, hit_info_qtl2) if( !is.null(sig.level ) ){ qtl2_info <- filter(qtl2_info, QTL_P < sig.level) q2 <- q2[ qtl2_info$gene ] @@ -644,7 +660,7 @@ runCOLOC <- function(gwas, qtl, qtl2 = NULL, hit, sig.level = NULL, target.file return( list(df = coloc_df, object = coloc_object) ) }) } - if( is.null(coloc_res) ){ return(NULL) } + if( is.null(unlist(coloc_res)) ){ return(NULL) } message_("COLOC finished") coloc_df <- map_df(coloc_res, "df", .id = "gene") %>% @@ -693,7 +709,7 @@ calc_LD <- function( coloc_res ){ option_list <- list( - make_option(c('-o', '--outFolder'), help='the path to the output file', default = ""), + make_option(c('-o', '--outFolder'), help='the path to the output file', default = "."), make_option(c('--gwas', '-g'), help= "the dataset ID for a GWAS in the GWAS/QTL database", default = NULL ), make_option(c('--qtl', '-q'), help = "the dataset ID for a QTL dataset in the GWAS/QTL database"), make_option(c('--qtl2', '-r'), help = "a second QTL dataset - using this will trigger QTL-QTL COLOC", default = NULL), @@ -703,6 +719,7 @@ option_list <- list( make_option(c('--debug'), help = "load all files and then save RData without running COLOC", action = "store_true", default = FALSE), make_option(c('--lowmem', '-b'), help = "do not save COLOC objects, just the summaries", action = "store_true", default = FALSE), make_option(c('--threads', '-c'), help = "how many threads to use", default = 1), + make_option(c('--MAF', '-m'), help = "path to custom MAF file", default = NULL), make_option(c('--verbose', '-v'), help = "if selected then output more information", action = "store_true", default = FALSE) ) @@ -720,15 +737,22 @@ target_file <- opt$targets low_mem <- opt$lowmem threads <- opt$threads verbose <- opt$verbose -#maf_1000gp1 <- "/sc/arion/projects/ad-omics/data/references/1KGP1/1000G_EUR_MAF.bed.gz" -maf_1000gp3 <- "/sc/arion/projects/ad-omics/data/references/1KGPp3v5/EUR_MAF/EUR.all.phase3_MAF.bed.gz" +MAF <- opt$MAF + +# default MAF is 1000g phase 3v5 +# but users can provide their own +# must have columns chr, pos, snp, MAF +if( is.null(MAF) ){ + #maf_1000gp1 <- "/sc/arion/projects/ad-omics/data/references/1KGP1/1000G_EUR_MAF.bed.gz" + MAF <- "/sc/arion/projects/ad-omics/data/references/1KGPp3v5/EUR_MAF/EUR.all.phase3_MAF.bed.gz" +} if(verbose){print(opt)} # load in MAF table -if( !exists("maf_1000g")){ - - maf_1000g <- loadMAF(maf_1000gp3) +if( !exists("maf_1000g") ){ + message(paste0("loading MAF from ", MAF ) ) + maf_1000g <- loadMAF(MAF) # load in liftover chain chain_hg19_hg38 <- import.chain("/sc/arion/projects/ad-omics/data/references/liftOver/hg19ToHg38.over.chain") } @@ -794,16 +818,19 @@ main <- function(){ all_res <- purrr::map_df(all_coloc, "full_res") if(!is.null(target_file) ){ - outFile <- file.path(outFolder, gsub(".tsv.gz", "_COLOC.tsv.gz", basename(target_file) ) ) - all_res <- dplyr::bind_cols(targets, all_res %>% select(-locus) ) + outFile <- file.path(outFolder, gsub(".tsv", "_COLOC.tsv", basename(target_file) ) ) + # here if any of the COLOCs did not proceed then the target df will be larger than the all_res results + + #all_res <- dplyr::bind_cols(targets, all_res %>% select(-locus) ) }else{ - outFile <- paste0(outFolder, qtl_dataset, "_", gwas_dataset, "_COLOC.tsv") + outFile <- paste0(outFolder,"/", qtl_dataset, "_", gwas_dataset, "_COLOC.tsv") names(all_obj) <- top_loci$locus } + dir.create(outFolder, showWarnings = FALSE) all_res <- dplyr::arrange(all_res, desc(PP.H4.abf) ) message("finished COLOC pipeline") message_verbose(paste0("writing output to ", outFile)) - readr::write_tsv(all_res, path = outFile) + readr::write_tsv(all_res, file = outFile) # save COLOC objects for plotting if( !low_mem ){ all_obj <- purrr::map(all_coloc, "coloc_object") From 483138b452c777112ed7cb768e45ec82bb38d358 Mon Sep 17 00:00:00 2001 From: Jack Humphrey Date: Tue, 31 Mar 2026 19:31:03 -0400 Subject: [PATCH 7/7] tweaks to qvalue; adding significance filter to COLOC with -s option --- COLOC/Snakefile | 10 +++++----- COLOC/cluster.yaml | 4 ++-- COLOC/scripts/run_COLOC.R | 24 +++++++++++++++++++----- qvalue_sharing/Snakefile | 4 ++-- qvalue_sharing/run_qvalue.R | 6 +++--- 5 files changed, 31 insertions(+), 17 deletions(-) diff --git a/COLOC/Snakefile b/COLOC/Snakefile index 74284c9..b6387fe 100644 --- a/COLOC/Snakefile +++ b/COLOC/Snakefile @@ -5,7 +5,7 @@ gwas_data = config["gwas"] qtl_data = config["qtl"] outFolder = config["outFolder"] geneMeta = config["geneMeta"] - +sig_threshold = config["sig_threshold"] # sanity check - can the GWAS and QTL data be found in the database? gwas_df = pd.read_excel("/sc/arion/projects/ad-omics/data/references//GWAS/GWAS-QTL_data_dictionary.xlsx", sheet_name = 1) qtl_df = pd.read_excel("/sc/arion/projects/ad-omics/data/references//GWAS/GWAS-QTL_data_dictionary.xlsx", sheet_name = 2) @@ -29,7 +29,7 @@ rule all: #expand(outFolder + "{GWAS}/{QTL}/{QTL}_{GWAS}_coloc_results.snp.1kgp3_ld.tsv.gz", GWAS = gwas_data, QTL = qtl_data) outFolder + "all_COLOC_results_merged_H4_0_no_LD.tsv.gz", outFolder + "all_COLOC_results_merged_H4_0.5_with_LD.tsv.gz", - outFolder + "pairwise_QTL_results.tsv.gz" + #outFolder + "pairwise_QTL_results.tsv.gz" #expand( outFolder + "{GWAS}/{QTL}_{GWAS}_COLOC.tsv", GWAS = gwas_data, QTL = qtl_data ) #outFolder + "all_COLOC_summary_results.tsv.gz" @@ -41,8 +41,8 @@ rule run_COLOC: script = "scripts/run_COLOC.R" shell: "mkdir -p {outFolder}{wildcards.GWAS};" - "ml R/4.0.3;" #ml arrow;" - "Rscript {params.script} -g {wildcards.GWAS} -q {wildcards.QTL} -o {outFolder}{wildcards.GWAS}/ " + "ml R;" #ml arrow;" + "Rscript {params.script} -g {wildcards.GWAS} -q {wildcards.QTL} -o {outFolder}{wildcards.GWAS}/ -s 1e-5 --verbose --lowmem" rule merge_COLOC_snp_level: input: @@ -53,7 +53,7 @@ rule merge_COLOC_snp_level: params: script = "scripts/merge_COLOC_results.R" shell: - "ml R/3.6.0;" # LDLINKR only installed on 3.6 + "ml R;" # LDLINKR only installed on 3.6 "Rscript {params.script} --threshold 0 --inFolder {outFolder} --geneMeta {geneMeta};" "Rscript {params.script} --threshold 0.5 --ld --inFolder {outFolder} --geneMeta {geneMeta} " diff --git a/COLOC/cluster.yaml b/COLOC/cluster.yaml index e5eddc7..bccd9de 100644 --- a/COLOC/cluster.yaml +++ b/COLOC/cluster.yaml @@ -1,7 +1,7 @@ __default__: queue: premium cores: 1 - mem: 24000 + mem: 48000 time: '120' name: $(basename $(pwd)):{rule}:{wildcards} output: logs/{rule}:{wildcards}.stdout @@ -9,5 +9,5 @@ __default__: himem: "" pairwise_QTL_COLOC: cores: 4 - mem: 8000 + mem: 16000 time: 8640 diff --git a/COLOC/scripts/run_COLOC.R b/COLOC/scripts/run_COLOC.R index 1904335..1758c26 100644 --- a/COLOC/scripts/run_COLOC.R +++ b/COLOC/scripts/run_COLOC.R @@ -367,8 +367,8 @@ extractQTL_tabix <- function(qtl, coord){ # extract all nominal QTL P-values overlapping the flanked GWAS hit # split by Gene being tested -# sig_level doesn't do anything yet -extractQTL <- function(qtl, coord, sig_level = 0.05, force_maf = FALSE, targets = NULL){ +# now implemented sig_level filtering for minimum P-value for each phenotype +extractQTL <- function(qtl, coord, sig_level = NULL, force_maf = FALSE, targets = NULL){ # variables stored in qtl # if qtl type is parquet then read in parquet files # if qtl type is tabix then query the coordinates through tabix @@ -423,7 +423,18 @@ extractQTL <- function(qtl, coord, sig_level = 0.05, force_maf = FALSE, targets if( pvalCol == "log10_p"){ result$pvalues <- 10^result$pvalues } - + # P-value thresholding - methylation QTLs, there are way too many sites to test all of them + if( !is.null(sig_level) ){ + message_verbose(paste0(" currently: ", length(unique(result$gene) ), " phenotypes in window" ) ) + message_verbose(paste0(" removing phenotypes with P < ", sig_level)) + pheno_p <- result %>% dplyr::group_by(gene) %>% dplyr::slice_min(pvalues, n = 1, with_ties = FALSE) %>% dplyr::filter(pvalues < as.numeric(sig_level) ) + result <- dplyr::filter(result, gene %in% pheno_p$gene) + if(nrow(result) == 0){ + message_verbose(" 0 phenotypes remaining after P thresholding") + return(NULL) + } + message_verbose(paste0("retaining ", nrow(result), " associations from ", nrow(pheno_p), " phenotypes" )) + } # deal with MAF - meta-analysis outputs won't have it # this requires "chr" to be present in QTL data #if( debug == TRUE){ result$MAF <- NA }else{ # why do we do this? breaks it when debug is TRUE @@ -582,7 +593,7 @@ runCOLOC <- function(gwas, qtl, qtl2 = NULL, hit, sig.level = NULL, target.file qtl_range <- joinCoords(qtl_coord, flank = 1e6) } message_verbose("extracting QTLs") - q <- extractQTL(qtl, qtl_range, targets = target.file[1]) + q <- extractQTL(qtl, qtl_range, targets = target.file[1], sig_level = sig.level) #print(names(q[[1]])) if( is.null(q) ){ return(NULL) } qtl_info <- getQTLinfo(q, hit_info) @@ -827,7 +838,10 @@ main <- function(){ names(all_obj) <- top_loci$locus } dir.create(outFolder, showWarnings = FALSE) - all_res <- dplyr::arrange(all_res, desc(PP.H4.abf) ) + # in case all COLOCs are null for some reason + if(nrow(all_res) > 0){ + all_res <- dplyr::arrange(all_res, desc(PP.H4.abf) ) + } message("finished COLOC pipeline") message_verbose(paste0("writing output to ", outFile)) readr::write_tsv(all_res, file = outFile) diff --git a/qvalue_sharing/Snakefile b/qvalue_sharing/Snakefile index f6c9c85..4b086d5 100644 --- a/qvalue_sharing/Snakefile +++ b/qvalue_sharing/Snakefile @@ -18,7 +18,7 @@ rule run_qvalue: output: outFolder + "{source}:{target}:{threshold}.qvalue.tsv" shell: - "ml R/3.6.0;" + "ml R;" "Rscript {params.script} -s {params.source} -t {params.target} -q {threshold} -o {outFolder}" rule merge_qvalue: @@ -29,5 +29,5 @@ rule merge_qvalue: output: outFolder + "all_qvalue_merged." + threshold + ".tsv" shell: - "ml R/3.6.0;" + "ml R;" "Rscript {params.script} -o {outFolder} -q {threshold}" diff --git a/qvalue_sharing/run_qvalue.R b/qvalue_sharing/run_qvalue.R index 50a372b..38c6511 100644 --- a/qvalue_sharing/run_qvalue.R +++ b/qvalue_sharing/run_qvalue.R @@ -11,10 +11,10 @@ pullData <- function(dataset, type = "GWAS"){ message(Sys.time()," * reading GWAS database from ", db_path) stopifnot( file.exists(db_path) ) if( type == "GWAS"){ - n_sheet <- 3 + n_sheet <- 2 } if( type == "QTL"){ - n_sheet <- 2 + n_sheet <- 3 } gwas_db <- suppressMessages(readxl::read_excel(db_path, sheet = n_sheet,na= c("", "-","NA"))) @@ -78,7 +78,7 @@ extractTargetQTL <- function(qtl, chr, source_qtls){ stopifnot( file.exists(qtl$full_path) ) # read in QTL - cmd <- paste0("ml bcftools; tabix ", qtl$full_path, " ", chr) + cmd <- paste0("ml tabix; tabix ", qtl$full_path, " ", chr) message( " * ", cmd) result <- data.table::fread( cmd = cmd, nThread = 8 ) if( nrow(result) == 0){