Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 37 additions & 8 deletions COLOC/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -5,24 +5,31 @@ gwas_data = config["gwas"]
qtl_data = config["qtl"]
outFolder = config["outFolder"]
geneMeta = config["geneMeta"]

sig_threshold = config["sig_threshold"]
# sanity check - can the GWAS and QTL data be found in the database?
gwas_df = pd.read_excel("/sc/arion/projects/ad-omics/data/references//GWAS/GWAS-QTL_data_dictionary.xlsx", sheet_name = 2)
qtl_df = pd.read_excel("/sc/arion/projects/ad-omics/data/references//GWAS/GWAS-QTL_data_dictionary.xlsx", sheet_name = 1)
gwas_df = pd.read_excel("/sc/arion/projects/ad-omics/data/references//GWAS/GWAS-QTL_data_dictionary.xlsx", sheet_name = 1)
qtl_df = pd.read_excel("/sc/arion/projects/ad-omics/data/references//GWAS/GWAS-QTL_data_dictionary.xlsx", sheet_name = 2)

gwas_check = all(i in gwas_df["dataset"].tolist() for i in gwas_data )
qtl_check = all(i in qtl_df["dataset"].tolist() for i in qtl_data )

if not all([gwas_check, qtl_check]):
print(" * QTL and/or GWAS cannot be found in database")
if not all([gwas_check]):
print(" * GWAS cannot be found in database")
sys.exit()

if not all([qtl_check]):
print(" * QTL cannot be found in database")
print(qtl_check)
sys.exit()


shell.prefix("export PS1=""; ml anaconda3; CONDA_BASE=$(conda info --base); source $CONDA_BASE/etc/profile.d/conda.sh; ml purge; conda activate snakemake;")
rule all:
input:
#expand(outFolder + "{GWAS}/{QTL}/{QTL}_{GWAS}_coloc_results.snp.1kgp3_ld.tsv.gz", GWAS = gwas_data, QTL = qtl_data)
outFolder + "all_COLOC_results_merged_H4_0_no_LD.tsv.gz",
outFolder + "all_COLOC_results_merged_H4_0.5_with_LD.tsv.gz",
#outFolder + "pairwise_QTL_results.tsv.gz"
#expand( outFolder + "{GWAS}/{QTL}_{GWAS}_COLOC.tsv", GWAS = gwas_data, QTL = qtl_data )
#outFolder + "all_COLOC_summary_results.tsv.gz"

Expand All @@ -34,8 +41,8 @@ rule run_COLOC:
script = "scripts/run_COLOC.R"
shell:
"mkdir -p {outFolder}{wildcards.GWAS};"
"ml R/4.0.3;" #ml arrow;"
"Rscript {params.script} -g {wildcards.GWAS} -q {wildcards.QTL} -o {outFolder}{wildcards.GWAS}/ "
"ml R;" #ml arrow;"
"Rscript {params.script} -g {wildcards.GWAS} -q {wildcards.QTL} -o {outFolder}{wildcards.GWAS}/ -s 1e-5 --verbose --lowmem"

rule merge_COLOC_snp_level:
input:
Expand All @@ -46,7 +53,7 @@ rule merge_COLOC_snp_level:
params:
script = "scripts/merge_COLOC_results.R"
shell:
"ml R/4.0.3;"
"ml R;" # LDLINKR only installed on 3.6
"Rscript {params.script} --threshold 0 --inFolder {outFolder} --geneMeta {geneMeta};"
"Rscript {params.script} --threshold 0.5 --ld --inFolder {outFolder} --geneMeta {geneMeta} "

Expand All @@ -61,3 +68,25 @@ rule create_LD_tables:
shell:
"conda activate echoR;"
"Rscript {params.script} -g {wildcards.GWAS} -q {wildcards.QTL} --inFile {input} --outFolder {params.out}"

rule prepare_QTL_COLOC:
input:
outFolder + "all_COLOC_results_merged_H4_0.5_with_LD.tsv.gz"
output:
outFolder + "pairwise_QTL_COLOC_targets.tsv.gz"
params:
script = "scripts/prepare_pairwise_qtl_coloc.R"
shell:
"ml R/4.2.0;"
"Rscript {params.script} -i {input} -o {output} -m across -p 0.8"

rule run_QTL_COLOC:
input:
outFolder + "pairwise_QTL_COLOC_targets.tsv.gz"
output:
outFolder + "pairwise_QTL_results.tsv.gz"
params:
script = "scripts/run_COLOC.R"
shell:
"ml R/4.2.0;"
"Rscript {params.script} -t {input} -o {output}"
7 changes: 5 additions & 2 deletions COLOC/cluster.yaml
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
__default__:
#partition: chimera
queue: premium
cores: 1
mem: 24000
mem: 48000
time: '120'
name: $(basename $(pwd)):{rule}:{wildcards}
output: logs/{rule}:{wildcards}.stdout
error: logs/{rule}:{wildcards}.stderr
himem: ""
pairwise_QTL_COLOC:
cores: 4
mem: 16000
time: 8640
31 changes: 31 additions & 0 deletions COLOC/pairwise_qtl_coloc.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import sys
import pandas as pd
import glob
import re
import os

# Pairwise QTL COLOC pipeline
# Jack Humphrey
outFolder = config["outFolder"]
inFolder = config["inFolder"]
MAF_file = config["MAF_file"]

target_files = [ x for x in glob.glob(inFolder + "*genewide*pairs*tsv.gz" )]
target_ids = [os.path.basename(re.sub('.tsv.gz', '', x, count=1)) for x in target_files ]


shell.prefix("export PS1=""; ml anaconda3; CONDA_BASE=$(conda info --base); source $CONDA_BASE/etc/profile.d/conda.sh; ml purge; conda activate snakemake;")
rule all:
input:
expand( outFolder + "{target}_COLOC.tsv.gz", target = target_ids)

rule pairwise_QTL_COLOC:
input:
inFolder + "{target}.tsv.gz"
output:
outFolder + "{target}_COLOC.tsv.gz"
params:
script = "scripts/run_COLOC.R"
shell:
"ml R;"
"Rscript {params.script} --targets {input} -o {outFolder} --lowmem --threads 1 --MAF {MAF_file}"
73 changes: 73 additions & 0 deletions COLOC/scripts/prepare_pairwise_qtl_coloc.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
library(optparse)

option_list <- list(
make_option(c('-o', '--outFile'), help='the path to the output file', default = ""),
make_option(c('-i', '--inFile'), help='the path to the input file', default = ""),
make_option(c('-m', '--mode'), help='mode - across or within?', default = "across"),
make_option(c('-p', '--pp'), help='min PP4', default = 0.5),
make_option(c('--debug'), help = "load all files and then save RData without running COLOC", action = "store_true", default = FALSE)
)

option.parser <- OptionParser(option_list=option_list)
opt <- parse_args(option.parser)

inFile <- opt$inFile
outFile <- opt$outFile
mode <- opt$mode
PP4 <- opt$pp
library(tidyverse)

#all_coloc <- read_tsv("../all_COLOC_results_merged_H4_0.5_with_LD.tsv.gz")
all_coloc <- read_tsv(inFile)

all_coloc$distance <- abs(all_coloc$GWAS_pos - all_coloc$QTL_pos)
all_coloc$distance_filter <- (!is.na(all_coloc$LD) & all_coloc$LD > 0.1) | all_coloc$distance < 5e5

qtl_coloc_input <- all_coloc %>%
filter(distance_filter == TRUE & PP.H4.abf > PP4) %>%
select(GWAS, QTL, locus, feature) %>%
distinct()

message(" * ", nrow(qtl_coloc_input), " phenotypes colocalize at PP4 > ", PP4)
stopifnot( nrow(qtl_coloc_input) > 0)


shared_loci <- select(qtl_coloc_input, GWAS, locus, QTL) %>%
distinct() %>%
group_by(GWAS, locus) %>%
tally() %>%
filter(n > 1)
message(" * ", nrow(shared_loci), " loci are shared between QTL types")
stopifnot( nrow(shared_loci) > 0)

qtl_coloc_input <- filter(qtl_coloc_input, locus %in% shared_loci$locus)

# get all pairwise comparisons required
# make optional - do you want to test features within a dataset or just across?
qtl_coloc_split <- qtl_coloc_input %>%
split(paste0(.$GWAS, ":", .$locus)) %>%
map_df( ~{
d <- select(.x, GWAS,locus, QTL, feature) %>% distinct()
qtl_pairs <- combn(unique(paste(d$QTL, d$feature) ), 2)
locus_df <- select(d, GWAS, locus)
pair_df <-
data.frame(qtl_1 = qtl_pairs[1,], qtl_2 = qtl_pairs[2,] ) %>%
tidyr::separate(qtl_1, into = c("qtl_1", "feature_1"), sep = " ") %>%
tidyr::separate(qtl_2, into = c("qtl_2", "feature_2"), sep = " ") %>%
mutate(GWAS = unique(d$GWAS), locus = unique(d$locus)) %>%
select(GWAS, locus, everything() )

}) %>%
distinct()

if( mode == "across"){

qtl_coloc_split <- qtl_coloc_split %>%
mutate( within = qtl_1 == qtl_2) %>% filter(within == FALSE)
}

write_tsv(qtl_coloc_split, outFile) #"test_eQTL_edQTL_pairwise_input.tsv")




Loading