@@ -43,7 +43,6 @@ import htsjdk.samtools._
43
43
import htsjdk .samtools .util .SequenceUtil
44
44
45
45
import java .util .concurrent .atomic .AtomicLong
46
- import scala .collection .immutable .IndexedSeq
47
46
import scala .collection .mutable .ListBuffer
48
47
import scala .collection .{BufferedIterator , Iterator , mutable }
49
48
@@ -210,7 +209,7 @@ object GroupReadsByUmi {
210
209
* Class that implements the directed adjacency graph method from umi_tools.
211
210
* See: https://github.com/CGATOxford/UMI-tools
212
211
*/
213
- private [umi] class AdjacencyUmiAssigner (val maxMismatches : Int ) extends UmiAssigner {
212
+ private [umi] class AdjacencyUmiAssigner (val maxMismatches : Int , val allowUmisWithDifferentLengths : Boolean ) extends UmiAssigner {
214
213
/** Represents a node in the adjacency graph; equality is just by UMI sequence. */
215
214
class Node (val umi : Umi , val count : Long , val children : mutable.Buffer [Node ] = mutable.Buffer ()) {
216
215
/** Gets the full set of descendants from this node. */
@@ -235,16 +234,22 @@ object GroupReadsByUmi {
235
234
236
235
/**
  * Returns whether or not a pair of UMIs match closely enough to be considered adjacent in the graph.
  *
  * Two UMIs match when they have the same length and differ at no more than `maxMismatches` positions.
  * When `allowUmisWithDifferentLengths` is set, UMIs of different lengths simply do not match; otherwise
  * a length difference is treated as an error.
  *
  * @param lhs the first UMI
  * @param rhs the second UMI
  * @return true if the UMIs have equal length and at most `maxMismatches` mismatching positions
  * @throws IllegalArgumentException if the lengths differ and `allowUmisWithDifferentLengths` is false
  */
protected def matches(lhs: Umi, rhs: Umi): Boolean = {
  // Resolve the length check as a standalone value.  Writing `if (..) {..} else {..} && {..}` would attach
  // the `&&` to the else-branch only, so the mismatch count would be skipped whenever the flag is set.
  val sameLength: Boolean =
    if (allowUmisWithDifferentLengths) {
      lhs.length == rhs.length
    } else {
      require(lhs.length == rhs.length, s"UMIs of different length detected: $lhs vs. $rhs")
      true
    }

  sameLength && {
    var idx = 0
    var mismatches = 0
    val len = lhs.length
    // Stop scanning as soon as the mismatch budget has been exceeded.
    while (idx < len && mismatches <= this.maxMismatches) {
      if (lhs(idx) != rhs(idx)) mismatches += 1
      idx += 1
    }

    mismatches <= this.maxMismatches
  }
}
249
254
250
255
/** Assigns IDs to each UMI based on the root to which it is mapped. */
@@ -271,7 +276,6 @@ object GroupReadsByUmi {
271
276
val nextRoot = remaining.remove(0 )
272
277
roots += nextRoot
273
278
val working = mutable.Buffer [Node ](nextRoot)
274
-
275
279
while (working.nonEmpty) {
276
280
val root = working.remove(0 )
277
281
val (hits, misses) = remaining.partition(other => root.count >= 2 * other.count - 1 && matches(root.umi, other.umi))
@@ -292,7 +296,7 @@ object GroupReadsByUmi {
292
296
*
293
297
* @param maxMismatches the maximum number of mismatches between UMIs
294
298
*/
295
- class PairedUmiAssigner (maxMismatches : Int ) extends AdjacencyUmiAssigner (maxMismatches) {
299
+ class PairedUmiAssigner (maxMismatches : Int , allowUmisWithDifferentLengths : Boolean ) extends AdjacencyUmiAssigner (maxMismatches, allowUmisWithDifferentLengths ) {
296
300
/** String that is prefixed onto the UMI from the read that maps to a lower coordinate in the genome. */
297
301
private [umi] val lowerReadUmiPrefix : String = (" a" * (maxMismatches+ 1 )) + " :"
298
302
@@ -402,27 +406,32 @@ case class TagFamilySizeMetric(family_size: Int,
402
406
403
407
/** The strategies implemented by [[GroupReadsByUmi ]] to identify reads from the same source molecule.*/
404
408
sealed trait Strategy extends EnumEntry {
405
- def newStrategy (edits : Int ): UmiAssigner
409
+ def newStrategy (edits : Int , allowUmisWithDifferentLengths : Boolean ): UmiAssigner
406
410
}
407
411
object Strategy extends FgBioEnum [Strategy ] {
408
412
def values : IndexedSeq [Strategy ] = findValues
409
413
/** Strategy in which only reads with identical UMI sequences are grouped together. */
410
414
case object Identity extends Strategy {
411
- def newStrategy (edits : Int = 0 ): UmiAssigner = {
415
+ def newStrategy (edits : Int = 0 , allowUmisWithDifferentLengths : Boolean ): UmiAssigner = {
412
416
require(edits == 0 , " Edits should be zero when using the identity UMI assigner." )
413
417
new IdentityUmiAssigner
414
418
}
415
419
}
420
+
416
421
/** Strategy to cluster reads into groups based on mismatches between reads in clusters. */
417
- case object Edit extends Strategy { def newStrategy (edits : Int ): UmiAssigner = new SimpleErrorUmiAssigner (edits) }
422
+ case object Edit extends Strategy { def newStrategy (edits : Int , allowUmisWithDifferentLengths : Boolean ): UmiAssigner = new SimpleErrorUmiAssigner (edits) }
418
423
/** Strategy based on the directed adjacency method described in [umi_tools](http://dx.doi.org/10.1101/051755)
419
424
* that allows for errors between UMIs but only when there is a count gradient.
420
425
*/
421
- case object Adjacency extends Strategy { def newStrategy (edits : Int ): UmiAssigner = new AdjacencyUmiAssigner (edits) }
426
+ case object Adjacency extends Strategy {
427
+ def newStrategy (edits : Int , allowUmisWithDifferentLengths : Boolean ): UmiAssigner = new AdjacencyUmiAssigner (edits, allowUmisWithDifferentLengths)
428
+ }
422
429
/** Strategy similar to the [[Adjacency]] strategy, but for methods that produce templates with a
423
430
* pair of UMIs such that a read with A-B is related to but not identical to a read with B-A.
424
431
*/
425
- case object Paired extends Strategy { def newStrategy (edits : Int ): UmiAssigner = new PairedUmiAssigner (edits)}
432
+ case object Paired extends Strategy {
433
+ def newStrategy (edits : Int , allowUmisWithDifferentLengths : Boolean ): UmiAssigner = new PairedUmiAssigner (edits, allowUmisWithDifferentLengths)
434
+ }
426
435
}
427
436
428
437
@ clp(group= ClpGroups .Umi , description =
@@ -510,6 +519,8 @@ class GroupReadsByUmi
510
519
|otherwise discard reads with UMIs shorter than this length and allow for differing UMI lengths.
511
520
|""" )
512
521
val minUmiLength : Option [Int ] = None ,
522
+ @ arg(flag= 'N' , doc= " Filter UMIs with N bases." ) val filterUmisWithN : Boolean = true ,
523
+ @ arg(flag= 'a' , doc= " Allow UMIs with different lengths" ) val allowUmisWithDifferentLengths : Boolean = false ,
513
524
@ arg(flag= 'x' , doc= """
514
525
|DEPRECATED: this option will be removed in future versions and inter-contig reads will be
515
526
|automatically processed.""" )
@@ -519,7 +530,7 @@ class GroupReadsByUmi
519
530
520
531
require(this .minUmiLength.forall(_ => this .strategy != Strategy .Paired ), " Paired strategy cannot be used with --min-umi-length" )
521
532
522
- private val assigner = strategy.newStrategy(this .edits)
533
+ private val assigner = strategy.newStrategy(this .edits, this .allowUmisWithDifferentLengths )
523
534
524
535
// Give values to unset parameters that are different in duplicate marking mode
525
536
private val _minMapQ = this .minMapQ.getOrElse(if (this .markDuplicates) 0 else 1 )
@@ -578,7 +589,9 @@ class GroupReadsByUmi
578
589
.filter(r => (r.mapped || (r.paired && r.mateMapped)) || { filteredPoorAlignment += 1 ; false })
579
590
.filter(r => (allowInterContig || r.unpaired || r.refIndex == r.mateRefIndex) || { filteredPoorAlignment += 1 ; false })
580
591
.filter(r => mapqOk(r, this ._minMapQ) || { filteredPoorAlignment += 1 ; false })
581
- .filter(r => ! r.get[String ](rawTag).exists(_.contains('N' )) || { filteredNsInUmi += 1 ; false })
592
+ .filter(
593
+ r => ! (filterUmisWithN && r.get[String ](rawTag).exists(_.contains('N' )) ) || { filteredNsInUmi += 1 ; false }
594
+ )
582
595
.filter { r =>
583
596
this .minUmiLength.forall { l =>
584
597
r.get[String ](this .rawTag).forall { umi =>
0 commit comments