|
| 1 | +#!/usr/bin/env Rscript |
| 2 | + |
| 3 | +# run the facets library |
| 4 | + |
| 5 | +# Version changelog: |
| 6 | +# v2: |
| 7 | +# Sourcing runFacets_myplot.R from the same folder of this script, wherever that might be. |
| 8 | +# v2.1: |
| 9 | +# Added '--tumorName' and '--normalName' options to account for different naming schemes. |
| 10 | +# Account for the possibility that '--cval2' and '--pre_cval' are passed with a string 'NULL' |
| 11 | +# v3: |
| 12 | +# set seed |
| 13 | +# use a default pre_cval |
| 14 | +# use only one cval (remove cval2; cval1 -> cval) |
| 15 | +# increase cval by 50 if hyperfragmented (save as additional result files). |
| 16 | +# add max_segs to define hyperfragmentation. |
| 17 | +# v3.icgc-argo: |
| 18 | +# remove normalName |
| 19 | +# no cval increase steps |
| 20 | +# omit runFacets_myplot.R and plotting only logR. |
| 21 | + |
| 22 | +suppressPackageStartupMessages(library("optparse")); |
| 23 | +suppressPackageStartupMessages(library("RColorBrewer")); |
| 24 | +suppressPackageStartupMessages(library("plyr")); |
| 25 | +suppressPackageStartupMessages(library("dplyr")); |
| 26 | +suppressPackageStartupMessages(library("tidyr")); |
| 27 | +suppressPackageStartupMessages(library("stringr")); |
| 28 | +suppressPackageStartupMessages(library("magrittr")); |
| 29 | +suppressPackageStartupMessages(library("facets")); |
| 30 | +suppressPackageStartupMessages(library("foreach")); |
| 31 | + |
| 32 | + |
| 33 | + |
| 34 | + |
| 35 | +if (!interactive()) { |
| 36 | + options(warn = -1, error = quote({ traceback(); q('no', status = 1) })) |
| 37 | +} |
| 38 | + |
| 39 | +optList <- list( |
| 40 | + make_option("--seed", default = 1234, type = 'integer', help = "seed for reproducibility"), |
| 41 | + make_option("--snp_nbhd", default = 250, type = 'integer', help = "window size"), |
| 42 | + make_option("--minNDepth", default = 5, type = 'integer', help = "minimum depth in normal to keep the position"), |
| 43 | + make_option("--maxNDepth", default= 500, type= 'integer', help = "maximum depth in normal to keep the position"), |
| 44 | + make_option("--pre_cval", default = 80, type = 'integer', help = "pre-processing critical value"), |
| 45 | + make_option("--cval", default = NULL, type = 'integer', help = "critical value for estimating diploid log Ratio"), |
| 46 | + make_option("--max_cval", default = 5000, type = 'integer', help = "maximum critical value for segmentation (increases by 100 until success)"), |
| 47 | + make_option("--min_nhet", default = 25, type = 'integer', help = "minimum number of heterozygote snps in a segment used for bivariate t-statistic during clustering of segment"), |
| 48 | + make_option("--genome", default = 'hg38', type = 'character', help = "genome of counts file"), |
| 49 | + make_option("--unmatched", default=FALSE, type=NULL, help="is it unmatched?"), |
| 50 | + make_option("--minGC", default = 0, type = NULL, help = "min GC of position"), |
| 51 | + make_option("--maxGC", default = 1, type = NULL, help = "max GC of position"), |
| 52 | + make_option("--max_segs", default = 3000, type = 'integer', help = "max number of segments to avoid hyperfragmentation"), |
| 53 | + make_option("--outPrefix", default = NULL, help = "output prefix"), |
| 54 | + make_option("--tumorName", default = NULL, help = "tumorName") |
| 55 | +) |
| 56 | + |
| 57 | +parser <- OptionParser(usage = "%prog [options] [tumor-normal base counts file]", option_list = optList); |
| 58 | + |
| 59 | +arguments <- parse_args(parser, positional_arguments = T); |
| 60 | +opt <- arguments$options; |
| 61 | + |
| 62 | +if (length(arguments$args) < 1) { |
| 63 | + cat("Need base counts file\n") |
| 64 | + print_help(parser); |
| 65 | + stop(); |
| 66 | +} else if (is.null(opt$outPrefix)) { |
| 67 | + cat("Need output prefix\n") |
| 68 | + print_help(parser); |
| 69 | + stop(); |
| 70 | +} else if (is.null(opt$tumorName)) { |
| 71 | + cat("Need tumorName\n") |
| 72 | + print_help(parser); |
| 73 | + stop(); |
| 74 | +} else { |
| 75 | + baseCountFile <- arguments$args[1]; |
| 76 | +} |
| 77 | + |
| 78 | +# Print input file and the options |
| 79 | +cat("\nInput file:\n",baseCountFile,"\n") |
| 80 | +cat("\nOptions:\n") |
| 81 | +for(i in 1:length(opt)) |
| 82 | +{ |
| 83 | + cat("",names(opt[i]), "=", head(opt[[i]],1),"\n") |
| 84 | +} |
| 85 | +cat("\n") |
| 86 | + |
| 87 | +switch(opt$genome, |
| 88 | + b37={gbuild="hg19"}, |
| 89 | + b37_hbv_hcv={gbuild="hg19"}, |
| 90 | + GRCh37={gbuild="hg19"}, |
| 91 | + hg19={gbuild="hg19"}, |
| 92 | + hg19_ionref={gbuild="hg19"}, |
| 93 | + mm9={gbuild="mm9"}, |
| 94 | + mm10={gbuild="mm10"}, |
| 95 | + GRCm38={gbuild="mm10"}, |
| 96 | + hg38={gbuild="hg38"}, |
| 97 | + { stop(paste("Invalid Genome",opt$genome)) }) |
| 98 | + |
| 99 | +buildData=installed.packages()["facets",] |
| 100 | +cat("#Module Info\n") |
| 101 | +for(fi in c("Package","LibPath","Version","Built")){ |
| 102 | + cat("#",paste(fi,":",sep=""),buildData[fi],"\n") |
| 103 | +} |
| 104 | +version=buildData["Version"] |
| 105 | +cat("\n") |
| 106 | + |
| 107 | +rcmat <- readSnpMatrix(gzfile(baseCountFile)) |
| 108 | +chromLevels=unique(rcmat[,1]) |
| 109 | +print(chromLevels) |
| 110 | +if (gbuild %in% c("hg19", "hg18")) { chromLevels=intersect(chromLevels, c(1:22,"X")) |
| 111 | +} else { chromLevels=intersect(chromLevels, c(1:19,"X"))} |
| 112 | +print(chromLevels) |
| 113 | + |
| 114 | +if(is.null(opt$cval)) { stop("cval cannot be NULL")} |
| 115 | + |
| 116 | +set.seed(opt$seed) |
| 117 | + |
| 118 | +if (opt$minGC == 0 & opt$maxGC == 1) { |
| 119 | + preOut=preProcSample(rcmat, snp.nbhd = opt$snp_nbhd, ndepth = opt$minNDepth, cval = opt$pre_cval, |
| 120 | + gbuild=gbuild, ndepthmax=opt$maxNDepth, unmatched=opt$unmatched) |
| 121 | +} else { |
| 122 | + if (gbuild %in% c("hg19", "hg18", "hg38")) |
| 123 | + nX <- 23 |
| 124 | + if (gbuild %in% c("mm9", "mm10")) |
| 125 | + nX <- 20 |
| 126 | + pmat <- facets:::procSnps(rcmat, ndepth=opt$minNDepth, het.thresh = 0.25, snp.nbhd = opt$snp_nbhd, |
| 127 | + gbuild=gbuild, unmatched=opt$unmatched, ndepthmax=opt$maxNDepth) |
| 128 | + dmat <- facets:::counts2logROR(pmat[pmat$rCountT > 0, ], gbuild, unmatched=opt$unmatched) |
| 129 | + dmat$keep[which(dmat$gcpct>=opt$maxGC | dmat$gcpct<=opt$minGC)] <- 0 |
| 130 | + dmat <- dmat[dmat$keep == 1,] |
| 131 | + tmp1 <- facets:::segsnps(dmat, opt$pre_cval, hetscale=F) |
| 132 | + pmat$keep <- 0 |
| 133 | + pmat$keep[which(paste(pmat$chrom, pmat$maploc, sep="_") %in% paste(dmat$chrom, dmat$maploc, sep="_"))] <- 1 |
| 134 | + |
| 135 | + tmp2 <- list(pmat = pmat, gbuild=gbuild, nX=nX) |
| 136 | + preOut <- c(tmp2,tmp1) |
| 137 | +} |
| 138 | + |
| 139 | +formatSegmentOutput <- function(out,sampID) { |
| 140 | + seg=list() |
| 141 | + seg$ID=rep(sampID,nrow(out$out)) |
| 142 | + seg$chrom=out$out$chr |
| 143 | + seg$loc.start=rep(NA,length(seg$ID)) |
| 144 | + seg$loc.end=seg$loc.start |
| 145 | + seg$num.mark=out$out$num.mark |
| 146 | + seg$seg.mean=out$out$cnlr.median |
| 147 | + for(i in 1:nrow(out$out)) { |
| 148 | + lims=range(out$jointseg$maploc[(out$jointseg$chrom==out$out$chr[i] & out$jointseg$seg==out$out$seg[i])],na.rm=T) |
| 149 | + seg$loc.start[i]=lims[1] |
| 150 | + seg$loc.end[i]=lims[2] |
| 151 | + } |
| 152 | + as.data.frame(seg) |
| 153 | +} |
| 154 | + |
| 155 | +out <- preOut %>% procSample(cval = opt$cval, min.nhet = opt$min_nhet) |
| 156 | + |
| 157 | +cat ("Completed preProc and proc\n") |
| 158 | +cat ("procSample FLAG is", out$FLAG, "\n") |
| 159 | + |
| 160 | +# save all objects except pileup |
| 161 | +save(file = str_c(opt$outPrefix, ".Rdata"), list = ls()[!grepl("^rcmat", ls())], compress=T) |
| 162 | + |
| 163 | +# Run emncf, don't break if error: |
| 164 | +print(str_c("attempting to run emncf() with cval = ", opt$cval)) |
| 165 | +fit <- tryCatch({ |
| 166 | + out %>% emcncf |
| 167 | +}, error = function(e) { |
| 168 | + print(paste("Error:", e)) |
| 169 | + return(NULL) |
| 170 | +}) |
| 171 | +if (!is.null(fit)) { |
| 172 | + cat ("emcncf was successful with cval", opt$cval, "\n") |
| 173 | + |
| 174 | + # make a table viewable in IGV |
| 175 | + out$IGV = formatSegmentOutput(out, opt$tumorName) |
| 176 | + |
| 177 | + # plot facets results |
| 178 | + if(sum(out$out$num.mark)<=10000) { height=4; width=7} else { height=6; width=9} |
| 179 | + pdf(file = str_c(opt$outPrefix, ".cncf.pdf"), height = height, width = width) |
| 180 | + plotSample(out, fit) |
| 181 | + dev.off() |
| 182 | + |
| 183 | + # save cncf table |
| 184 | + write.table(fit$cncf, str_c(opt$outPrefix, ".cncf.txt"), row.names = F, quote = F, sep = '\t') |
| 185 | + |
| 186 | + # save results and metrics |
| 187 | + ff = str_c(opt$outPrefix, ".out") |
| 188 | + cat("# Version =", version, "\n", file = ff, append = T) |
| 189 | + cat("# Input =", basename(baseCountFile), "\n", file = ff, append = T) |
| 190 | + cat("# tumor =", opt$tumorName, "\n", file = ff, append = T) |
| 191 | + cat("# snp.nbhd =", opt$snp_nbhd, "\n", file = ff, append = T) |
| 192 | + cat("# cval =", opt$cval, "\n", file = ff, append = T) |
| 193 | + cat("# min.nhet =", opt$min_nhet, "\n", file = ff, append = T) |
| 194 | + cat("# genome =", opt$genome, "\n", file = ff, append = T) |
| 195 | + cat("# Purity =", fit$purity, "\n", file = ff, append = T) |
| 196 | + cat("# Ploidy =", fit$ploidy, "\n", file = ff, append = T) |
| 197 | + cat("# dipLogR =", fit$dipLogR, "\n", file = ff, append = T) |
| 198 | + cat("# dipt =", fit$dipt, "\n", file = ff, append = T) |
| 199 | + cat("# loglik =", fit$loglik, "\n", file = ff, append = T) |
| 200 | + |
| 201 | +} else { |
| 202 | + cat ("emcncf failed with cval", opt$cval, "\n") |
| 203 | + fit <- NULL |
| 204 | + ff = str_c(opt$outPrefix, ".out") |
| 205 | + cat("# Version =", version, "\n", file = ff, append = T) |
| 206 | + cat("# Input =", basename(baseCountFile), "\n", file = ff, append = T) |
| 207 | + cat("# tumor =", opt$tumorName, "\n", file = ff, append = T) |
| 208 | + cat("# snp.nbhd =", opt$snp_nbhd, "\n", file = ff, append = T) |
| 209 | + cat("# cval =", opt$cval, "\n", file = ff, append = T) |
| 210 | + cat("# min.nhet =", opt$min_nhet, "\n", file = ff, append = T) |
| 211 | + cat("# genome =", opt$genome, "\n", file = ff, append = T) |
| 212 | + cat("# Purity =", "failed", "\n", file = ff, append = T) |
| 213 | + cat("# Ploidy =", "failed", "\n", file = ff, append = T) |
| 214 | + cat("# dipLogR =", "failed", "\n", file = ff, append = T) |
| 215 | + cat("# dipt =", "failed", "\n", file = ff, append = T) |
| 216 | + cat("# loglik =", "failed", "\n", file = ff, append = T) |
| 217 | +} |
| 218 | + |
| 219 | +# save all objects except pileup |
| 220 | +save(file = str_c(opt$outPrefix, ".Rdata"), list = ls()[!grepl("^rcmat", ls())], compress=T) |
| 221 | + |
| 222 | + |
| 223 | +warnings() |
| 224 | + |
0 commit comments