diff --git a/DESCRIPTION b/DESCRIPTION index e754ae73..02642479 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -2,14 +2,15 @@ Package: methylKit Type: Package Title: DNA methylation analysis from high-throughput bisulfite sequencing results -Version: 1.7.4 +Version: 1.7.6 Authors@R: c(person("Altuna", "Akalin", role = c("aut", "cre"), email = "aakalin@gmail.com"), person("Matthias","Kormaksson", role = "aut"), person("Sheng","Li", role = "aut"), person("Arsene", "Wabo", role = "ctb"), person("Adrian", "Bierling", role= "aut"), - person("Alexander","Gosdschan", role="aut")) + person("Alexander","Gosdschan", role="aut"), + person("Katarzyna","Wreczycka", role="ctb")) Author: Altuna Akalin [aut, cre], Matthias Kormaksson [aut], Sheng Li [aut], Arsene Wabo [ctb], Adrian Bierling [aut], Alexander Gosdschan [aut] diff --git a/NEWS b/NEWS index 079f646f..b0aecf4b 100644 --- a/NEWS +++ b/NEWS @@ -1,8 +1,23 @@ +methylKit 1.7.6 +-------------- +NEW FUNCTIONS AND FEATURES +* methSeg(): introduce parameter `cores` to run fastseg in parallel + per chromosome; update description; add tests + + +methylKit 1.7.5 +-------------- +NEW FUNCTIONS AND FEATURES +* methSeg(): introduce parameter `initialize.on.subset` to subset data + for initialization of mixture modeling; update description; add tests + + methylKit 1.7.4 -------------- IMPROVEMENTS AND BUG FIXES * update link to test-file for methSeg() function + methylKit 1.7.3 -------------- IMPROVEMENTS AND BUG FIXES diff --git a/R/methSeg.R b/R/methSeg.R index dfbc240c..687ad47b 100644 --- a/R/methSeg.R +++ b/R/methSeg.R @@ -1,5 +1,121 @@ # segmentation functions +#' # Segment methylation calls. +#' Run fastseg GRanges object `obj` +#' with additional parameters in `...`. +.run.fastseg = function(obj, ...){ + + dots <- list(...) + + ## check wether obj contains at least one metacol + if(ncol(elementMetadata(obj))<1) + stop("GRanges does not have any meta column.") + + ## check wether obj contains is sorted by position + if(is.unsorted(obj,ignore.strand=TRUE)) { + obj <- sort(obj,ignore.strand=TRUE) + message("Object not sorted by position, sorting now.") + } + + ## check wether obj contains at least two ranges else stop + if(length(obj)<=1) + stop("segmentation requires at least two ranges.") + + # match argument names to fastseg arguments + args.fastseg=dots[names(dots) %in% names(formals(fastseg)[-1] ) ] + + args.fastseg[["x"]]=obj + + # do the segmentation + #seg.res=fastseg(obj) + seg.res <- do.call("fastseg", args.fastseg) + + # stop if segmentation produced only one range + if(length(seg.res)==1) { + warning("segmentation produced only one range, no mixture modeling possible.") + seg.res$seg.group <- "1" + } + return(seg.res) +} + +#' # Use mixture modeling for the density function estimated +#' # and BIC criterion used to decide the optimum number of components +#' # in mixture modeling. +#' Run mclust::densityMclust using GRanges object `seg.res`, look for description +#' of `join.neighbours`, `initialize.on.subset` and other parameters +#' in methylKit methSeg. +.run.mclust = function(seg.res, diagnostic.plot=TRUE, join.neighbours=FALSE, + initialize.on.subset=1, ...){ + + dots <- list(...) + + # match argument names to Mclust + args.Mclust=dots[names(dots) %in% names(formals(Mclust)[-1]) ] + + # if joining, do not show first clustering + if(join.neighbours) { + diagnostic.plot.old = diagnostic.plot + diagnostic.plot = FALSE + } + + # Run subsampling of segments before performing the mixture modeling + if("initialization" %in% names(args.Mclust)){ + if("subset" %in% names(args.Mclust[["initialization"]])) { + if(length(args.Mclust[["initialization"]][["subset"]]) < 9 ){ + stop("too few samples, increase the size of subset.") + } + message(paste("initializing clustering with", + length(args.Mclust[["initialization"]][["subset"]]), + "segments.")) + initialize.on.subset = 1 + } + } + + if(initialize.on.subset != 1 && initialize.on.subset > 0 ) { + + if( initialize.on.subset > 0 & initialize.on.subset < 1 ) + nbr.sample = floor(length(seg.res) * initialize.on.subset) + + if( initialize.on.subset > 1) + nbr.sample = initialize.on.subset + + if( nbr.sample < 9 ){stop("too few samples, increase the size of subset.") } + + message(paste("initializing clustering with",nbr.sample,"out of",length(seg.res),"total segments.")) + # estimate parameters for mclust + sub <- sample(1:length(seg.res), nbr.sample,replace = FALSE) + args.Mclust[["initialization"]]=list(subset = sub) + } + + # decide on number of components/groups + args.Mclust[["score.gr"]]=seg.res + args.Mclust[["diagnostic.plot"]]=diagnostic.plot + dens=do.call("densityFind", args.Mclust ) + + # add components/group ids + mcols(seg.res)$seg.group=as.character(dens$classification) + + # if joining, show clustering after joining + if(join.neighbours) { + message("joining neighbouring segments and repeating clustering.") + seg.res <- joinSegmentNeighbours(seg.res) + diagnostic.plot <- diagnostic.plot.old + + # get the new density + args.Mclust[["score.gr"]]=seg.res + args.Mclust[["diagnostic.plot"]]=diagnostic.plot + # skip second progress bar + args.Mclust[["verbose"]]=FALSE + # do not run subsampling of segments + args.Mclust[["initialization"]]<-NULL + + dens=do.call("densityFind", args.Mclust ) + + } + return(seg.res) +} + + #' Segment methylation or differential methylation profile #' #' The function uses a segmentation algorithm (\code{\link[fastseg]{fastseg}}) @@ -23,6 +139,15 @@ #' @param join.neighbours if TRUE neighbouring segments that cluster to the same #' seg.group are joined by extending the ranges, summing up num.marks and #' averaging over seg.means. +#' @param initialize.on.subset a numeric value indicating either percentage or +#' absolute value of regions to subsample from segments before performing +#' the mixture modeling. The value can be either between 0 and 1, +#' e.g. 0.1 means that 10% of data will be used for sampling, or be an +#' integer higher than 1 to define the number of regions to sample. +#' By default uses the whole dataset, which can be time consuming on +#' large datasets. (Default: 1) +#' @param mc.cores number of cores. The function is parallelized per chromosome +#' (default: 1). #' @param ... arguments to \code{\link[fastseg]{fastseg}} function in fastseg #' package, or to \code{\link[mclust]{densityMclust}} #' in Mclust package, could be used to fine tune the segmentation algorithm. @@ -42,10 +167,20 @@ #' @details #' To be sure that the algorithm will work on your data, #' the object should have at least 5000 records +#' +#' @details After initial segmentation with fastseg(), segments are clustered +#' into self-similar groups based on their mean methylation profile +#' using mixture modeling. Mixture modeling estimates the initial +#' parameters of the distributions by using the whole dataset. +#' If "initialize.on.subset" argument set as described, the function +#' will use a subset of the data to initialize the parameters of the +#' mixture modeling prior to the Expectation-maximization (EM) algorithm +#' used by \code{\link{Mclust}} package. +#' #' @examples #' #' \donttest{ -#' download.file("https://github.com/BIMSBbioinfo/compgen2018/raw/master/day3_diffMeth/data/H1.chr21.chr22.rds", +#' download.file("https://dl.dropboxusercontent.com/u/1373164/H1.chr21.chr22.rds", #' destfile="H1.chr21.chr22.rds",method="curl") #' #' mbw=readRDS("H1.chr21.chr22.rds") @@ -62,103 +197,128 @@ #' #' unlink(list.files(pattern="H1.chr21.chr22",full.names=TRUE)) #' -#' @author Altuna Akalin, contributions by Arsene Wabo +#' @author Altuna Akalin, contributions by Arsene Wabo and Katarzyna +#' Wreczycka #' #' @seealso \code{\link{methSeg2bed}}, \code{\link{joinSegmentNeighbours}} #' #' @export #' @docType methods #' @rdname methSeg -methSeg<-function(obj, diagnostic.plot=TRUE, join.neighbours=FALSE, ...){ +methSeg<-function(obj, diagnostic.plot=TRUE, join.neighbours=FALSE, + initialize.on.subset=1, + mc.cores=1, ...){ - dots <- list(...) + # 1. Run segmentation using fastseg - - ##coerce object to granges - if(class(obj)=="methylRaw" | class(obj)=="methylRawDB") { + if(mc.cores==1){ + + ##coerce object to granges + if(class(obj)=="methylRaw" | class(obj)=="methylRawDB") { obj= as(obj,"GRanges") ## calculate methylation score mcols(obj)$meth=100*obj$numCs/obj$coverage ## select only required mcol obj = obj[,"meth"] - }else if (class(obj)=="methylDiff" | class(obj)=="methylDiffDB") { + }else if (class(obj)=="methylDiff" | class(obj)=="methylDiffDB") { obj = as(obj,"GRanges") ## use methylation difference as score obj = obj[,"meth.diff"] - }else if (class(obj) != "GRanges"){ - stop("only methylRaw or methylDiff objects ", - "or GRanges objects can be used in this function") - } - - # destrand - strand(obj) <- "*" - - ## check wether obj contains at least one metacol - if(ncol(elementMetadata(obj))<1) - stop("GRanges does not have any meta column.") - - ## check wether obj contains is sorted by position - if(is.unsorted(obj,ignore.strand=TRUE)) { - obj <- sort(obj,ignore.strand=TRUE) - message("Object not sorted by position, sorting now.") - } - - ## check wether obj contains at least two ranges else stop - if(length(obj)<=1) - stop("segmentation requires at least two ranges.") - - # match argument names to fastseg arguments - args.fastseg=dots[names(dots) %in% names(formals(fastseg)[-1] ) ] - - # match argument names to Mclust - args.Mclust=dots[names(dots) %in% names(formals(Mclust)[-1]) ] - - args.fastseg[["x"]]=obj - - # do the segmentation - #seg.res=fastseg(obj) - seg.res <- do.call("fastseg", args.fastseg) - #seg.res <- do.call("fastseg2", args.fastseg) - - # stop if segmentation produced only one range - if(length(seg.res)==1) { - warning("segmentation produced only one range, no mixture modeling possible.") - seg.res$seg.group <- "1" - return(seg.res) - } - - # if joining, do not show first clustering - if(join.neighbours) { - diagnostic.plot.old = diagnostic.plot - diagnostic.plot = FALSE + }else if (class(obj) != "GRanges"){ + stop("only methylRaw or methylDiff objects ", + "or GRanges objects can be used in this function") + } + + # destrand + strand(obj) <- "*" + + # Run segmentation + seg.res = .run.fastseg(obj) } - # decide on number of components/groups - args.Mclust[["score.gr"]]=seg.res - args.Mclust[["diagnostic.plot"]]=diagnostic.plot - dens=do.call("densityFind", args.Mclust ) - - # add components/group ids - mcols(seg.res)$seg.group=as.character(dens$classification) - - # if joining, show clustering after joining - if(join.neighbours) { - message("joining neighbouring segments and repeating clustering.") - seg.res <- joinSegmentNeighbours(seg.res) - diagnostic.plot <- diagnostic.plot.old + if(mc.cores>1){ + + ## Non-tabix files + if (class(obj)=="methylDiff" | class(obj)=="methylRaw" | + class(obj)=="GRanges"){ - # get the new density - args.Mclust[["score.gr"]]=seg.res - args.Mclust[["diagnostic.plot"]]=diagnostic.plot - # skip second progress bar - args.Mclust[["verbose"]]=FALSE - dens=do.call("densityFind", args.Mclust ) + ##coerce object to granges + if(class(obj)=="methylRaw") { + obj= as(obj,"GRanges") + ## calculate methylation score + mcols(obj)$meth=100*obj$numCs/obj$coverage + ## select only required mcol + obj = obj[,"meth"] + } else if(class(obj)=="methylDiff") { + obj = as(obj,"GRanges") + ## use methylation difference as score + obj = obj[,"meth.diff"] + }else if (class(obj) != "GRanges"){ + stop("only methylRaw or methylDiff objects ", + "or GRanges objects can be used in this function") + } + # destrand + strand(obj) <- "*" + + # keep only seqlevels that are non-empty + seqlevels(obj) <- seqlevelsInUse(obj) + chrs = seqlevels(obj) + # Split the input object based on chromosome + # run methSeg on chromosomes (Could be in parallel or not) + seg.res.list <- mclapply(chrs, function(chr){ + + # Run segmentation + seg.res = .run.fastseg(obj[seqnames(obj)==chr]) + }, mc.cores=mc.cores) + + # Merge resulting segments again to whole genome. + # suppressWarnings -> combined objs dont have chrs in common + seg.res = suppressWarnings( do.call("c", seg.res.list) ) + + ## Tabix files + } else if(class(obj)=="methylDiffDB" | class(obj)=="methylRawDB"){ + + .run.fastseg.tabix = function(gr0, class0, ...){ + + # rename names of meta columns of input GRanges `gr0` to + # methylKit naming convention (such as e.g. coverage, numCs, numTs) + df2getcolnames = as.data.frame(gr0[1]) + df2getcolnames$width = NULL + print(class0) + .setMethylDBNames(df2getcolnames, class0) + names(mcols(gr0)) = colnames(df2getcolnames)[5:ncol(df2getcolnames)] + + # destrand + strand(gr0) <- "*" + + # Run segmentation + .run.fastseg(gr0, ...) + } + # Run segmentation for each chromosome + seg.res <- applyTbxByChr(obj@dbpath, + return.type = "GRanges", + FUN = .run.fastseg.tabix, + class = class(obj), + ..., + mc.cores = mc.cores) + } } - seg.res + # 2. Density Estimation via Model-Based Clustering using + # mclust::densityMclust function + + if( length(seg.res) > 1 ){ + seg.res = .run.mclust(seg.res, + diagnostic.plot=diagnostic.plot, + join.neighbours=join.neighbours, + initialize.on.subset=initialize.on.subset, + ...) + } + return(seg.res) } + # not needed .methSeg<-function(score.gr,score.cols=NULL,...){ #require(fastseg) @@ -232,9 +392,9 @@ diagPlot<-function(dens,score.gr){ #' @docType methods #' @rdname methSeg2bed methSeg2bed<-function(segments,filename, -trackLine="track name='meth segments' description='meth segments' itemRgb=On", -colramp=colorRamp(c("gray","green", "darkgreen")) - ){ + trackLine="track name='meth segments' description='meth segments' itemRgb=On", + colramp=colorRamp(c("gray","green", "darkgreen")) +){ if(class(segments)!="GRanges"){ stop("segments object has to be of class GRanges") } diff --git a/R/tabix.functions.R b/R/tabix.functions.R index d8d2400f..73d37484 100644 --- a/R/tabix.functions.R +++ b/R/tabix.functions.R @@ -484,7 +484,7 @@ applyTbxByChunk<-function(tbxFile,chunk.size=1e6,dir,filename, } -# applyTbxByCHr +# applyTbxByChr #' Apply a function on tabix files chromosome by chromosome #' #' The function reads a tabix file chromosome by chromosome and applies @@ -511,7 +511,8 @@ applyTbxByChunk<-function(tbxFile,chunk.size=1e6,dir,filename, #' @return either a path to a tabix or text file, or a data frame or data.table #' @noRd applyTbxByChr<-function(tbxFile,chrs,dir,filename, - return.type=c("tabix","data.frame","data.table"), + return.type=c("tabix","data.frame","data.table", + "GRanges"), FUN,...,mc.cores=1){ return.type <- match.arg(return.type) @@ -519,7 +520,7 @@ applyTbxByChr<-function(tbxFile,chrs,dir,filename, if(Sys.info()['sysname']=="Windows") {mc.cores = 1} if(missing(chrs)) { chrs = Rsamtools::seqnamesTabix(tbxFile)} if(return.type =="tabix"){ - + # create a custom function that contains the function # to be applied myFunc<-function(chr,tbxFile,dir,filename,FUN,...){ @@ -539,11 +540,11 @@ applyTbxByChr<-function(tbxFile,chrs,dir,filename, res=mclapply(chrs,myFunc,tbxFile,dir,filename2,FUN,...,mc.cores = mc.cores) # collect & cat temp files,then make tabix - + path <- catsub2tabix(dir,filename2,filename) return(gsub(".tbi","",path)) - + }else if(return.type=="data.frame"){ @@ -558,7 +559,7 @@ applyTbxByChr<-function(tbxFile,chrs,dir,filename, # collect and return data.frame(rbindlist(res)) - }else{ + }else if(return.type=="data.table"){ myFunc3<-function(chr,tbxFile,FUN,...){ data=getTabixByChr(chr = chr,tbxFile,return.type="data.table") @@ -569,6 +570,17 @@ applyTbxByChr<-function(tbxFile,chrs,dir,filename, # collect and return rbindlist(res) + }else{ + myFunc4<-function(chr,tbxFile,FUN,...){ + data=getTabixByChr(chr = chr,tbxFile,return.type="GRanges") + res=FUN(data,...) + } + + res=mclapply(chrs,myFunc4,tbxFile,FUN,...,mc.cores = mc.cores) + + # collect and return + # suppressWarnings -> combined objs dont have chrs in common + suppressWarnings( do.call("c", res) ) } } diff --git a/man/methSeg.Rd b/man/methSeg.Rd index 935e9ff4..e798ab85 100644 --- a/man/methSeg.Rd +++ b/man/methSeg.Rd @@ -5,7 +5,8 @@ \alias{methSeg} \title{Segment methylation or differential methylation profile} \usage{ -methSeg(obj, diagnostic.plot = TRUE, join.neighbours = FALSE, ...) +methSeg(obj, diagnostic.plot = TRUE, join.neighbours = FALSE, + initialize.on.subset = 1, mc.cores = 1, ...) } \arguments{ \item{obj}{\code{\link[GenomicRanges]{GRanges}}, \code{\link{methylDiff}}, @@ -25,6 +26,17 @@ in mixture modeling.} seg.group are joined by extending the ranges, summing up num.marks and averaging over seg.means.} +\item{initialize.on.subset}{a numeric value indicating either percentage or +absolute value of regions to subsample from segments before performing +the mixture modeling. The value can be either between 0 and 1, +e.g. 0.1 means that 10% of data will be used for sampling, or be an +integer higher than 1 to define the number of regions to sample. +By default uses the whole dataset, which can be time consuming on +large datasets. (Default: 1)} + +\item{mc.cores}{number of cores. The function is parallelized per chromosome +(default: 1).} + \item{...}{arguments to \code{\link[fastseg]{fastseg}} function in fastseg package, or to \code{\link[mclust]{densityMclust}} in Mclust package, could be used to fine tune the segmentation algorithm. @@ -52,11 +64,20 @@ as high or low methylated regions. \details{ To be sure that the algorithm will work on your data, the object should have at least 5000 records + +After initial segmentation with fastseg(), segments are clustered + into self-similar groups based on their mean methylation profile + using mixture modeling. Mixture modeling estimates the initial + parameters of the distributions by using the whole dataset. + If "initialize.on.subset" argument set as described, the function + will use a subset of the data to initialize the parameters of the + mixture modeling prior to the Expectation-maximization (EM) algorithm + used by \code{\link{Mclust}} package. } \examples{ \donttest{ - download.file("https://github.com/BIMSBbioinfo/compgen2018/raw/master/day3_diffMeth/data/H1.chr21.chr22.rds", + download.file("https://dl.dropboxusercontent.com/u/1373164/H1.chr21.chr22.rds", destfile="H1.chr21.chr22.rds",method="curl") mbw=readRDS("H1.chr21.chr22.rds") @@ -78,5 +99,6 @@ unlink(list.files(pattern="H1.chr21.chr22",full.names=TRUE)) \code{\link{methSeg2bed}}, \code{\link{joinSegmentNeighbours}} } \author{ -Altuna Akalin, contributions by Arsene Wabo +Altuna Akalin, contributions by Arsene Wabo and Katarzyna +Wreczycka } diff --git a/tests/testthat/test-8-methSeg.r b/tests/testthat/test-8-methSeg.r index 72a8edb5..1af1b7b2 100644 --- a/tests/testthat/test-8-methSeg.r +++ b/tests/testthat/test-8-methSeg.r @@ -25,6 +25,27 @@ test_that("check if methSeg returns error for methylBase objects" ,{ expect_error(methSeg(methylBase.obj,diagnostic.plot = FALSE)) }) +test_that("check if methSeg with cores > 1 is the same as cores=1" ,{ + a=methSeg(methylRawList.obj[[3]],diagnostic.plot = FALSE) + b=methSeg(methylRawList.obj[[3]],diagnostic.plot = FALSE, cores=2) + expect_equal(a,b) +}) + +test_that("check if methSeg with cores > 1 is the same as cores=1 (non-tabix file)" ,{ + a=methSeg(methylRawList.obj[[3]],diagnostic.plot = FALSE) + b=methSeg(methylRawList.obj[[3]],diagnostic.plot = FALSE, cores=2) + expect_equal(a,b) +}) + +methylRawDB.obj <- suppressMessages( methRead( + system.file("extdata", "control1.myCpG.txt", package = "methylKit"), + sample.id = "ctrl1", assembly = "hg18", dbtype="tabix") ) + +test_that("check if methSeg with cores > 1 is the same as cores=1 (tabix file)" ,{ + a=methSeg(methylRawDB.obj,diagnostic.plot = FALSE) + b=methSeg(methylRawDB.obj,diagnostic.plot = FALSE, cores=2) + expect_equal(a,b) +}) gr = as(methylRawList.obj[[1]],"GRanges") mcols(gr)$meth=100*gr$numCs/gr$coverage @@ -55,6 +76,12 @@ test_that("check if joining neighbours works" ,{ expect_true(all(rle(res2$seg.group)$lengths == 1)) }) +test_that("check if initialization works" ,{ + expect_is(methSeg(methylDiff.obj,diagnostic.plot = FALSE,verbose=FALSE),"GRanges") + expect_is(methSeg(methylDiff.obj,diagnostic.plot = FALSE,verbose=FALSE,initialization=list(subset = sample(1:13,9,replace = FALSE))),"GRanges") + expect_error(methSeg(methylDiff.obj,diagnostic.plot = FALSE,verbose=FALSE,initialize.on.subset = 0.6)) + expect_is(methSeg(methylDiff.obj,diagnostic.plot = FALSE,verbose=FALSE,initialize.on.subset = 10),"GRanges") +}) seg <- methSeg(tileRaw,diagnostic.plot = FALSE) methSeg2bed(segments = seg, filename = "test.bed")