Skip to content

Commit 48a75fc

Browse files
committed
Add options filterUmisWithN and allowUmisWithDifferentLengths
* filterUmisWithN defaults to true (current behavior); if false, Ns are treated like other bases. * allowUmisWithDifferentLengths defaults to false (current behavior); if true, UMIs with different lengths are treated as mismatches. Also: fix name.
1 parent ff1ca67 commit 48a75fc

File tree

2 files changed

+35
-22
lines changed

2 files changed

+35
-22
lines changed

src/main/scala/com/fulcrumgenomics/umi/CallMolecularConsensusReads.scala

+2-2
Original file line numberDiff line numberDiff line change
@@ -72,9 +72,9 @@ import htsjdk.samtools.SAMFileHeader.{GroupOrder, SortOrder}
7272
|calls each end of a pair independently, and does not jointly call bases that overlap within a pair. Insertion or
7373
|deletion errors in the reads are not considered in the consensus model.
7474
|
75-
|The consensus reads produced are unaligned, due to the difficulty and error-prone nature of inferring the conesensus
75+
|The consensus reads produced are unaligned, due to the difficulty and error-prone nature of inferring the consensus
7676
|alignment. Consensus reads should therefore be aligned after, which should not be too expensive as likely there
77-
|are far fewer consensus reads than input raw raws. Please see how best to use this tool within the best-practice
77+
|are far fewer consensus reads than input raw reads. Please see how best to use this tool within the best-practice
7878
|pipeline: https://github.com/fulcrumgenomics/fgbio/blob/main/docs/best-practice-consensus-pipeline.md
7979
|
8080
|Particular attention should be paid to setting the `--min-reads` parameter as this can have a dramatic effect on

src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala

+33-20
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,6 @@ import htsjdk.samtools._
4343
import htsjdk.samtools.util.SequenceUtil
4444

4545
import java.util.concurrent.atomic.AtomicLong
46-
import scala.collection.immutable.IndexedSeq
4746
import scala.collection.mutable.ListBuffer
4847
import scala.collection.{BufferedIterator, Iterator, mutable}
4948

@@ -210,7 +209,7 @@ object GroupReadsByUmi {
210209
* Class that implements the directed adjacency graph method from umi_tools.
211210
* See: https://github.com/CGATOxford/UMI-tools
212211
*/
213-
private[umi] class AdjacencyUmiAssigner(val maxMismatches: Int) extends UmiAssigner {
212+
private[umi] class AdjacencyUmiAssigner(val maxMismatches: Int, val allowUmisWithDifferentLengths: Boolean) extends UmiAssigner {
214213
/** Represents a node in the adjacency graph; equality is just by UMI sequence. */
215214
class Node(val umi: Umi, val count: Long, val children: mutable.Buffer[Node] = mutable.Buffer()) {
216215
/** Gets the full set of descendants from this node. */
@@ -235,16 +234,22 @@ object GroupReadsByUmi {
235234

236235
/** Returns whether or not a pair of UMIs match closely enough to be considered adjacent in the graph. */
237236
protected def matches(lhs: Umi, rhs: Umi): Boolean = {
  // Resolve the length check to a value BEFORE combining it with the mismatch count.
  // Writing `if (c) { a } else { b } && { d }` parses as `if (c) a else (b && d)`, which
  // would skip the mismatch count entirely whenever `allowUmisWithDifferentLengths` is true
  // and report every pair of equal-length UMIs as matching.
  val sameLength: Boolean =
    if (allowUmisWithDifferentLengths) {
      // UMIs of differing lengths are tolerated, but are never considered adjacent.
      lhs.length == rhs.length
    } else {
      // UMIs of differing lengths are an error in this mode.
      require(lhs.length == rhs.length, s"UMIs of different length detected: $lhs vs. $rhs")
      true
    }

  sameLength && {
    // Count mismatching positions, stopping early once the limit has been exceeded.
    var idx = 0
    var mismatches = 0
    val len = lhs.length
    while (idx < len && mismatches <= this.maxMismatches) {
      if (lhs(idx) != rhs(idx)) mismatches += 1
      idx += 1
    }

    mismatches <= this.maxMismatches
  }
}
249254

250255
/** Assigns IDs to each UMI based on the root to which it is mapped. */
@@ -271,7 +276,6 @@ object GroupReadsByUmi {
271276
val nextRoot = remaining.remove(0)
272277
roots += nextRoot
273278
val working = mutable.Buffer[Node](nextRoot)
274-
275279
while (working.nonEmpty) {
276280
val root = working.remove(0)
277281
val (hits, misses) = remaining.partition(other => root.count >= 2 * other.count - 1 && matches(root.umi, other.umi))
@@ -292,7 +296,7 @@ object GroupReadsByUmi {
292296
*
293297
* @param maxMismatches the maximum number of mismatches between UMIs
294298
*/
295-
class PairedUmiAssigner(maxMismatches: Int) extends AdjacencyUmiAssigner(maxMismatches) {
299+
class PairedUmiAssigner(maxMismatches: Int, allowUmisWithDifferentLengths: Boolean) extends AdjacencyUmiAssigner(maxMismatches, allowUmisWithDifferentLengths) {
296300
/** String that is prefixed onto the UMI from the read that maps to a lower coordinate in the genome. */
297301
private[umi] val lowerReadUmiPrefix: String = ("a" * (maxMismatches+1)) + ":"
298302

@@ -402,27 +406,32 @@ case class TagFamilySizeMetric(family_size: Int,
402406

403407
/** The strategies implemented by [[GroupReadsByUmi]] to identify reads from the same source molecule.*/
404408
sealed trait Strategy extends EnumEntry {
405-
def newStrategy(edits: Int): UmiAssigner
409+
def newStrategy(edits: Int, allowUmisWithDifferentLengths: Boolean): UmiAssigner
406410
}
407411
object Strategy extends FgBioEnum[Strategy] {
408412
def values: IndexedSeq[Strategy] = findValues
409413
/** Strategy in which only reads with identical UMI sequences are grouped together. */
410414
case object Identity extends Strategy {
411-
def newStrategy(edits: Int = 0): UmiAssigner = {
415+
def newStrategy(edits: Int = 0, allowUmisWithDifferentLengths: Boolean): UmiAssigner = {
412416
require(edits == 0, "Edits should be zero when using the identity UMI assigner.")
413417
new IdentityUmiAssigner
414418
}
415419
}
420+
416421
/** Strategy to cluster reads into groups based on mismatches between reads in clusters. */
417-
case object Edit extends Strategy { def newStrategy(edits: Int): UmiAssigner = new SimpleErrorUmiAssigner(edits) }
422+
case object Edit extends Strategy { def newStrategy(edits: Int, allowUmisWithDifferentLengths: Boolean): UmiAssigner = new SimpleErrorUmiAssigner(edits) }
418423
/** Strategy based on the directed adjacency method described in [umi_tools](http://dx.doi.org/10.1101/051755)
419424
* that allows for errors between UMIs but only when there is a count gradient.
420425
*/
421-
case object Adjacency extends Strategy { def newStrategy(edits: Int): UmiAssigner = new AdjacencyUmiAssigner(edits) }
426+
case object Adjacency extends Strategy {
427+
def newStrategy(edits: Int, allowUmisWithDifferentLengths: Boolean): UmiAssigner = new AdjacencyUmiAssigner(edits, allowUmisWithDifferentLengths)
428+
}
422429
/** Strategy similar to the [[Adjacency]] strategy, but for methods that produce templates with a
423430
* pair of UMIs such that a read with A-B is related to but not identical to a read with B-A.
424431
*/
425-
case object Paired extends Strategy { def newStrategy(edits: Int): UmiAssigner = new PairedUmiAssigner(edits)}
432+
case object Paired extends Strategy {
433+
def newStrategy(edits: Int, allowUmisWithDifferentLengths: Boolean): UmiAssigner = new PairedUmiAssigner(edits, allowUmisWithDifferentLengths)
434+
}
426435
}
427436

428437
@clp(group=ClpGroups.Umi, description =
@@ -510,6 +519,8 @@ class GroupReadsByUmi
510519
|otherwise discard reads with UMIs shorter than this length and allow for differing UMI lengths.
511520
|""")
512521
val minUmiLength: Option[Int] = None,
522+
@arg(flag='N', doc="Filter UMIs with N bases.") val filterUmisWithN: Boolean = true,
523+
@arg(flag='a', doc="Allow UMIs with different lengths") val allowUmisWithDifferentLengths: Boolean = false,
513524
@arg(flag='x', doc= """
514525
|DEPRECATED: this option will be removed in future versions and inter-contig reads will be
515526
|automatically processed.""")
@@ -519,7 +530,7 @@ class GroupReadsByUmi
519530

520531
require(this.minUmiLength.forall(_ => this.strategy != Strategy.Paired), "Paired strategy cannot be used with --min-umi-length")
521532

522-
private val assigner = strategy.newStrategy(this.edits)
533+
private val assigner = strategy.newStrategy(this.edits, this.allowUmisWithDifferentLengths)
523534

524535
// Give values to unset parameters that are different in duplicate marking mode
525536
private val _minMapQ = this.minMapQ.getOrElse(if (this.markDuplicates) 0 else 1)
@@ -578,7 +589,9 @@ class GroupReadsByUmi
578589
.filter(r => (r.mapped || (r.paired && r.mateMapped)) || { filteredPoorAlignment += 1; false })
579590
.filter(r => (allowInterContig || r.unpaired || r.refIndex == r.mateRefIndex) || { filteredPoorAlignment += 1; false })
580591
.filter(r => mapqOk(r, this._minMapQ) || { filteredPoorAlignment += 1; false })
581-
.filter(r => !r.get[String](rawTag).exists(_.contains('N')) || { filteredNsInUmi += 1; false })
592+
.filter(
593+
r => !(filterUmisWithN && r.get[String](rawTag).exists(_.contains('N')) ) || { filteredNsInUmi += 1; false }
594+
)
582595
.filter { r =>
583596
this.minUmiLength.forall { l =>
584597
r.get[String](this.rawTag).forall { umi =>

0 commit comments

Comments
 (0)