ICGC-ARGO-Structural-Variation-CN-WG
diff --git a/‎facets/.dockerignore‎
Lines changed: 5 additions & 0 deletions b/‎facets/.dockerignore‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎facets/Dockerfile‎
Lines changed: 23 additions & 0 deletions b/‎facets/Dockerfile‎
Lines changed: 23 additions & 0 deletions
diff --git a/‎facets/README.md‎
Lines changed: 83 additions & 0 deletions b/‎facets/README.md‎
Lines changed: 83 additions & 0 deletions
diff --git a/‎facets/facetsRun.R‎
Lines changed: 224 additions & 0 deletions b/‎facets/facetsRun.R‎
Lines changed: 224 additions & 0 deletions
@@ -0,0 +1,5 @@
+.gitignore
+.nextflow*
+tests
+work
+outdir
@@ -0,0 +1,23 @@
+FROM continuumio/miniconda3:4.9.2
+
+# filled by wfpm
+LABEL org.opencontainers.image.source="https://github.com/icgc-argo-structural-variation-cn-wg/icgc-argo-sv-copy-number"
+
+# add ps (required by nextflow)
+# NOTE(review): --allow-releaseinfo-change is presumably needed because the apt
+# release metadata of the pinned base image has changed since it was built.
+RUN apt-get --allow-releaseinfo-change update && \
+    apt-get install -y --no-install-recommends procps && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+# install facets and dependencies
+# The two installs stay separate so each keeps its own channel (-c) setting;
+# package caches are removed in the same layer so they do not bloat the image.
+RUN /opt/conda/bin/conda install --yes -c conda-forge \
+        r-base=4.0.3 \
+        r-dplyr \
+        r-foreach \
+        r-magrittr \
+        r-optparse \
+        r-plyr \
+        r-rcolorbrewer \
+        r-stringr \
+        r-tidyr && \
+    /opt/conda/bin/conda clean --all --yes
+RUN /opt/conda/bin/conda install --yes -c bioconda \
+        r-facets=0.6.1 \
+        snp-pileup=0.6.1 && \
+    /opt/conda/bin/conda clean --all --yes
+
+# Add main wrapper (COPY creates /tools if it does not exist):
+ENV PATH="/tools:${PATH}"
+COPY facetsRun.R /tools/
+
+ENTRYPOINT ["/usr/bin/env"]
+
+CMD ["/bin/bash"]
@@ -0,0 +1,83 @@
+# FACETS
+
+FACETS (Fraction and Allele specific Copy number Estimate from Tumor/normal Sequencing) infers allele-specific DNA copy number and clonal heterogeneity from high-throughput sequencing data, including whole-genome, whole-exome, and some targeted cancer gene panels. The method implements a bivariate genome segmentation, followed by allele-specific copy number calls. Tumor purity, ploidy, and cellular fractions are estimated and reported in the output. This tool simplifies large-scale application by providing comprehensive output and integrated visualization.
+
+Read more: [https://github.com/mskcc/facets/](https://github.com/mskcc/facets/)
+
+## Usage
+
+The typical command for running the pipeline is as follows:
+
+```
+nextflow run wes-postproc/modules/facets --input input.txt -profile cluster,singularity
+```
+
+Mandatory arguments:
+```
+    --input         Tab delimited file (no header), with paths to following files:
+                    tumor_ID    normal_ID    tumor.bam    normal.bam    target.dbsnp
+```
+
+Optional arguments:
+```
+    --snp_pileup    Full path to the folder containing the snp_pileup files (you might want to use this when re-running facets)
+    --summaryPrefix Prefix for the summary files [all.geneCN]
+    --q             (snp-pileup) Sets the minimum threshold for mapping quality [1]
+    --Q             (snp-pileup) Sets the minimum threshold for base quality [13]
+    --r             (snp-pileup) Comma separated list of minimum read counts for a position to be output [25,0]
+    --d             (snp-pileup) Sets the maximum depth [1000]
+    --genome        Genome build (b37, GRCh37, hg19, mm9, mm10, GRCm38, hg38). [hg38]
+    --seed          [1234]
+    --snp_nbhd      Window size [250]
+    --minNDepth     Minimum depth in normal to keep the position [25]
+    --maxNDepth     Maximum depth in normal to keep the position [1000]
+    --pre_cval      Pre-processing critical value [cval1 - 50]
+    --cval1         Critical value for estimating diploid log Ratio [200]
+    --cval2         Starting critical value for segmentation (increases by 25 until success) [cval1 - 50]
+    --max_cval      Maximum critical value for segmentation (increases by 25 until success) [5000]
+    --min_nhet      Minimum number of heterozygote snps in a segment used for bivariate t-statistic during clustering of segment [25]
+    --unmatched     Is it unmatched? [FALSE]
+    --minGC         Min GC of position [0]
+    --maxGC         Max GC of position [1]
+```
+
+## Output
+```
+./facets_out/snp_pileup .................. pileup files for every sample.
+    {tumor_id}__{normal_id}__q{params.q}_Q{params.Q}_d{params.maxNDepth}_r{params.r}.bc.gz
+
+
+
+./facets_out/cval1{params.cval1} .......... FACETS results for every sample.
+    {tumor_id}__{normal_id}.cncf.pdf ...... genome-wide profile. Figures:
+                                            log-ratio: logR  with  chromosomes  alternating  in  blue  and gray. The green line indicates the median logR in the sample. The purple line indicates the logR of the diploid state.
+                                            log-odds-ratio: Segment means are plotted as red lines.
+                                            copy number (em): plots the total (black) and minor (red) copy number for each segment.
+                                            cf-em: shows the associated cellular fraction (cf). Dark blue indicates high cf. Light blue indicates low cf. Beige indicates a normal segment (total=2,minor=1).
+    {tumor_id}__{normal_id}.cncf.txt ...... FACETS result table. The columns are:
+                                            chrom: the chromosome to which the segment belongs.
+                                            seg: the segment number.
+                                            num.mark: the number of SNPs in the segment.
+                                            nhet: the number of SNPs that are deemed heterozygous.
+                                            cnlr.median: the median log-ratio of the segment.
+                                            mafR: the log-odds-ratio summary for the segment (close to zero means the alleles are in balance).
+                                            segclust: the segment cluster to which segment belongs.
+                                            cnlr.median.clust: the median log-ratio of the segment cluster.
+                                            mafR.clust: the log-odds-ratio summary for the segment cluster.
+                                            cf.em: the cellular fraction of the segment.
+                                            tcn.em: the total copy number of the segment.
+                                            lcn.em: the minor copy number of the segment.
+    {tumor_id}__{normal_id}.logR.pdf ...... genome-wide profile log-ratio only.
+    {tumor_id}__{normal_id}.out ........... result summary file and log
+    {tumor_id}__{normal_id}.Rdata ......... FACETS R session
+
+```
+
+## Fetching the singularity container
+```
+bash scripts/fetch_image.sh
+```
+
+## Fetching resource files
+```
+bash scripts/fetch_resources.sh
+```
@@ -0,0 +1,224 @@
+#!/usr/bin/env Rscript
+
+# run the facets library
+
+# Version changelog:
+# v2:
+#  Sourcing runFacets_myplot.R from the same folder of this script, wherever that might be.
+# v2.1:
+#  Added '--tumorName' and '--normalName' options to account for different naming schemes.
+#  Account for the possibility that '--cval2' and '--pre_cval' are passed with a string 'NULL'
+# v3:
+#  set seed
+#  use a default pre_cval
+#  use only one cval (remove cval2; cval1 -> cval)
+#  increase cval by 50 if hyperfragmented (save as additional result files).
+#  add max_segs to define hyperfragmentation.
+# v3.icgc-argo:
+#  remove normalName
+#  no cval increase steps
+#  omit runFacets_myplot.R and plotting only logR.
+
+suppressPackageStartupMessages(library("optparse"));
+suppressPackageStartupMessages(library("RColorBrewer"));
+suppressPackageStartupMessages(library("plyr"));
+suppressPackageStartupMessages(library("dplyr"));
+suppressPackageStartupMessages(library("tidyr"));
+suppressPackageStartupMessages(library("stringr"));
+suppressPackageStartupMessages(library("magrittr"));
+suppressPackageStartupMessages(library("facets"));
+suppressPackageStartupMessages(library("foreach"));
+
+
+
+
+# In non-interactive runs: suppress warnings and make any uncaught error print
+# a traceback and terminate the R process with exit status 1.
+if (!interactive())
+    options(warn = -1, error = quote({ traceback(); q('no', status = 1) }))
+
+# Command-line options. --cval, --outPrefix and --tumorName have no default.
+optList <- list(
+    make_option("--seed",      type = 'integer',   default = 1234,
+                help = "seed for reproducibility"),
+    make_option("--snp_nbhd",  type = 'integer',   default = 250,
+                help = "window size"),
+    make_option("--minNDepth", type = 'integer',   default = 5,
+                help = "minimum depth in normal to keep the position"),
+    make_option("--maxNDepth", type = 'integer',   default = 500,
+                help = "maximum depth in normal to keep the position"),
+    make_option("--pre_cval",  type = 'integer',   default = 80,
+                help = "pre-processing critical value"),
+    make_option("--cval",      type = 'integer',   default = NULL,
+                help = "critical value for estimating diploid log Ratio"),
+    make_option("--max_cval",  type = 'integer',   default = 5000,
+                help = "maximum critical value for segmentation (increases by 100 until success)"),
+    make_option("--min_nhet",  type = 'integer',   default = 25,
+                help = "minimum number of heterozygote snps in a segment used for bivariate t-statistic during clustering of segment"),
+    make_option("--genome",    type = 'character', default = 'hg38',
+                help = "genome of counts file"),
+    make_option("--unmatched", type = NULL,        default = FALSE,
+                help = "is it unmatched?"),
+    make_option("--minGC",     type = NULL,        default = 0,
+                help = "min GC of position"),
+    make_option("--maxGC",     type = NULL,        default = 1,
+                help = "max GC of position"),
+    make_option("--max_segs",  type = 'integer',   default = 3000,
+                help = "max number of segments to avoid hyperfragmentation"),
+    make_option("--outPrefix", default = NULL, help = "output prefix"),
+    make_option("--tumorName", default = NULL, help = "tumorName")
+)
+
+parser <- OptionParser(usage = "%prog [options] [tumor-normal base counts file]", option_list = optList);
+
+# Parse options plus positional arguments; the first positional argument is
+# the base counts file.
+arguments <- parse_args(parser, positional_arguments = TRUE);
+opt <- arguments$options;
+
+# Validate required inputs one by one; each failure prints the reason and the
+# help text, then aborts via stop().
+if (length(arguments$args) < 1) {
+    cat("Need base counts file\n")
+    print_help(parser);
+    stop();
+}
+if (is.null(opt$outPrefix)) {
+    cat("Need output prefix\n")
+    print_help(parser);
+    stop();
+}
+if (is.null(opt$tumorName)) {
+    cat("Need tumorName\n")
+    print_help(parser);
+    stop();
+}
+baseCountFile <- arguments$args[1];
+
+# Echo the input file and each parsed option (first element of its value) to stdout
+cat("\nInput file:\n",baseCountFile,"\n")
+cat("\nOptions:\n")
+for (i in 1:length(opt)) {
+    cat("",names(opt)[i], "=", head(opt[[i]],1),"\n")
+}
+cat("\n")
+
+# Map the user-facing genome name onto the build name stored in gbuild;
+# empty switch arms fall through to the next non-empty one. Unknown names abort.
+gbuild <- switch(opt$genome,
+    b37=,
+    b37_hbv_hcv=,
+    GRCh37=,
+    hg19=,
+    hg19_ionref="hg19",
+    mm9="mm9",
+    mm10=,
+    GRCm38="mm10",
+    hg38="hg38",
+    stop(paste("Invalid Genome",opt$genome)))
+
+# Report which installed facets package is in use and remember its version
+buildData <- installed.packages()["facets",]
+cat("#Module Info\n")
+for (fi in c("Package","LibPath","Version","Built")) {
+    cat("#",paste0(fi,":"),buildData[fi],"\n")
+}
+version <- buildData["Version"]
+cat("\n")
+
+# --cval has no default, so fail fast here, before the pileup file is read.
+if(is.null(opt$cval)) { stop("cval cannot be NULL")}
+
+# Load the tumor/normal base counts and report the chromosomes present,
+# first as found in the file and then restricted to the expected set.
+rcmat <- readSnpMatrix(gzfile(baseCountFile))
+chromLevels=unique(rcmat[,1])
+print(chromLevels)
+# Human builds (including hg38) keep chromosomes 1-22 and X; the remaining
+# builds accepted by this script (mm9, mm10) keep 1-19 and X.
+if (gbuild %in% c("hg18", "hg19", "hg38")) { chromLevels=intersect(chromLevels, c(1:22,"X"))
+} else { chromLevels=intersect(chromLevels, c(1:19,"X"))}
+print(chromLevels)
+
+# Seed the RNG with --seed so that repeated runs are reproducible.
+set.seed(opt$seed)
+
+# Build preOut, the preprocessed input later handed to procSample().
+# With the default GC window (minGC == 0 and maxGC == 1) no GC filtering is
+# requested, so the exported preProcSample() is used as-is.
+if (opt$minGC == 0 & opt$maxGC == 1) {
+	preOut=preProcSample(rcmat, snp.nbhd = opt$snp_nbhd, ndepth = opt$minNDepth, cval = opt$pre_cval, 
+		gbuild=gbuild, ndepthmax=opt$maxNDepth, unmatched=opt$unmatched)
+} else {
+	# Custom GC window: run the preprocessing steps by hand through facets
+	# internals (:::) so positions can be dropped by GC content before segmentation.
+	# NOTE(review): presumably mirrors what preProcSample() does internally --
+	# confirm against the installed facets version.
+	# nX is 23 for human builds and 20 for mouse builds.
+	# NOTE(review): presumably the numeric code facets uses for chromosome X -- confirm.
+	if (gbuild %in% c("hg19", "hg18", "hg38"))
+		nX <- 23
+	if (gbuild %in% c("mm9", "mm10"))
+	 nX <- 20
+	pmat <- facets:::procSnps(rcmat, ndepth=opt$minNDepth, het.thresh = 0.25, snp.nbhd = opt$snp_nbhd, 
+		gbuild=gbuild, unmatched=opt$unmatched, ndepthmax=opt$maxNDepth)
+	# Only rows of pmat with rCountT > 0 are passed on.
+	dmat <- facets:::counts2logROR(pmat[pmat$rCountT > 0, ], gbuild, unmatched=opt$unmatched)
+	# Clear the keep flag where gcpct is >= maxGC or <= minGC, then drop every row not flagged keep == 1.
+        dmat$keep[which(dmat$gcpct>=opt$maxGC | dmat$gcpct<=opt$minGC)] <- 0
+	dmat <- dmat[dmat$keep == 1,]
+	tmp1 <- facets:::segsnps(dmat, opt$pre_cval, hetscale=F)
+	# Rebuild pmat$keep: 1 only for positions (keyed as "chrom_maploc") that are still present in dmat.
+	pmat$keep <- 0
+	pmat$keep[which(paste(pmat$chrom, pmat$maploc, sep="_") %in% paste(dmat$chrom, dmat$maploc, sep="_"))] <- 1
+
+	# Combine pmat, gbuild and nX with the segsnps() result into a single list.
+	tmp2 <- list(pmat = pmat, gbuild=gbuild, nX=nX)
+	preOut <- c(tmp2,tmp1)
+}
+
+# Build a per-segment table (ID, chrom, loc.start, loc.end, num.mark, seg.mean)
+# from a FACETS result, e.g. for viewing in IGV.
+#
+# Args:
+#   out:    list holding a segment table `out$out` (fields read here: chr, seg,
+#           num.mark, cnlr.median) and a per-position table `out$jointseg`
+#           (fields read here: chrom, seg, maploc).
+#   sampID: sample identifier repeated in the ID column of every row.
+#
+# Returns:
+#   data.frame with one row per row of `out$out`; loc.start / loc.end are the
+#   smallest / largest non-NA maploc among the positions of that segment.
+#   An empty segment table yields a zero-row data.frame.
+formatSegmentOutput <- function(out,sampID) {
+	seg=list()
+	seg$ID=rep(sampID,nrow(out$out))
+	# NOTE(review): if the column is actually named `chrom`, `$chr` only works
+	# through partial matching of `$` -- confirm against the facets output.
+	seg$chrom=out$out$chr
+	seg$loc.start=rep(NA,length(seg$ID))
+	seg$loc.end=seg$loc.start
+	seg$num.mark=out$out$num.mark
+	seg$seg.mean=out$out$cnlr.median
+	# seq_len() instead of 1:nrow(): with zero segments 1:0 would iterate over
+	# c(1, 0) and leave the columns with unequal lengths.
+	for(i in seq_len(nrow(out$out))) {
+		lims=range(out$jointseg$maploc[(out$jointseg$chrom==out$out$chr[i] & out$jointseg$seg==out$out$seg[i])],na.rm=T)
+		seg$loc.start[i]=lims[1]
+		seg$loc.end[i]=lims[2]
+	}
+	as.data.frame(seg)
+}
+
+# Segment the preprocessed data with the requested critical value
+out <- procSample(preOut, cval = opt$cval, min.nhet = opt$min_nhet)
+
+cat ("Completed preProc and proc\n")
+cat ("procSample FLAG is", out$FLAG, "\n")
+
+# Checkpoint: save every global object except those whose name starts with "rcmat"
+save(file = str_c(opt$outPrefix, ".Rdata"), list = ls()[!grepl("^rcmat", ls())], compress = TRUE)
+
+# Run emcncf(); an error is printed and turned into fit == NULL instead of aborting
+print(str_c("attempting to run emncf() with cval = ", opt$cval))
+fit <- tryCatch(
+    emcncf(out),
+    error = function(e) {
+        print(paste("Error:", e))
+        NULL
+    }
+)
+
+if (is.null(fit)) {
+    cat ("emcncf failed with cval", opt$cval, "\n")
+    fit <- NULL
+} else {
+    cat ("emcncf was successful with cval", opt$cval, "\n")
+
+    # segment table viewable in IGV
+    out$IGV = formatSegmentOutput(out, opt$tumorName)
+
+    # genome-wide plot; a larger page is used when more than 10000 markers are present
+    if (sum(out$out$num.mark) > 10000) { height = 6; width = 9 } else { height = 4; width = 7 }
+    pdf(file = str_c(opt$outPrefix, ".cncf.pdf"), height = height, width = width)
+    plotSample(out, fit)
+    dev.off()
+
+    # cncf table, tab separated
+    write.table(fit$cncf, str_c(opt$outPrefix, ".cncf.txt"), row.names = FALSE, quote = FALSE, sep = '\t')
+}
+
+# Summary file: the run settings are appended in both cases, followed by either
+# the fitted metrics or the word "failed" for each of them.
+ff = str_c(opt$outPrefix, ".out")
+cat("# Version =", version, "\n", file = ff, append = TRUE)
+cat("# Input =", basename(baseCountFile), "\n", file = ff, append = TRUE)
+cat("# tumor =", opt$tumorName, "\n", file = ff, append = TRUE)
+cat("# snp.nbhd =", opt$snp_nbhd, "\n", file = ff, append = TRUE)
+cat("# cval =", opt$cval, "\n", file = ff, append = TRUE)
+cat("# min.nhet =", opt$min_nhet, "\n", file = ff, append = TRUE)
+cat("# genome =", opt$genome, "\n", file = ff, append = TRUE)
+if (is.null(fit)) {
+    cat("# Purity =", "failed", "\n", file = ff, append = TRUE)
+    cat("# Ploidy =", "failed", "\n", file = ff, append = TRUE)
+    cat("# dipLogR =", "failed", "\n", file = ff, append = TRUE)
+    cat("# dipt =", "failed", "\n", file = ff, append = TRUE)
+    cat("# loglik =", "failed", "\n", file = ff, append = TRUE)
+} else {
+    cat("# Purity =", fit$purity, "\n", file = ff, append = TRUE)
+    cat("# Ploidy =", fit$ploidy, "\n", file = ff, append = TRUE)
+    cat("# dipLogR =", fit$dipLogR, "\n", file = ff, append = TRUE)
+    cat("# dipt =", fit$dipt, "\n", file = ff, append = TRUE)
+    cat("# loglik =", fit$loglik, "\n", file = ff, append = TRUE)
+}
+
+# Final save of every global object except those whose name starts with "rcmat"
+save(file = str_c(opt$outPrefix, ".Rdata"), list = ls()[!grepl("^rcmat", ls())], compress = TRUE)
+
+
+warnings()
+
-Original file line number
+Diff line change
@@ @@ -0,0 +1,5 @@ @@
 +.gitignore
 +.nextflow*
 +tests
 +work
 +outdir