Skip to content

Commit cd1460b

Browse files
committed
fix: properly template-coordinate sort and duplicate mark secondary and supplementary reads
Secondary and supplementary reads must use the coordinates of the primary alignments within the template, otherwise they will not guaranteed to be next the primary alignments in the file. Therefore, we've added the "rp" and "mp" tags to store the SA-tag equivalent information for the primary alignment. This keeps information about the primary alignments with the secondary and supplementary alignments.
1 parent be1e466 commit cd1460b

File tree

5 files changed

+130
-36
lines changed

5 files changed

+130
-36
lines changed

src/main/scala/com/fulcrumgenomics/bam/Bams.scala

+45-4
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
package com.fulcrumgenomics.bam
2626

2727
import com.fulcrumgenomics.FgBioDef._
28+
import com.fulcrumgenomics.alignment.Cigar
2829
import com.fulcrumgenomics.bam.api.SamOrder.Queryname
2930
import com.fulcrumgenomics.bam.api._
3031
import com.fulcrumgenomics.commons.collection.{BetterBufferedIterator, SelfClosingIterator}
@@ -41,6 +42,36 @@ import htsjdk.samtools.util.{CloserUtil, CoordMath, Murmur3, SequenceUtil}
4142
import java.io.Closeable
4243
import scala.math.{max, min}
4344

45+
46+
47+
case class Supplementary(refName: String, start: Int, positiveStrand: Boolean, cigar: Cigar, mapq: Int, nm: Int) {
48+
def negativeStrand: Boolean = !positiveStrand
49+
def refIndex(header: SAMFileHeader): Int = header.getSequence(refName).getSequenceIndex
50+
51+
def end: Int = start + cigar.lengthOnTarget - 1
52+
def unclippedStart: Int = {
53+
SAMUtils.getUnclippedStart(start, cigar.toHtsjdkCigar)
54+
}
55+
56+
def unclippedEnd: Int = {
57+
SAMUtils.getUnclippedEnd(end, cigar.toHtsjdkCigar)
58+
}
59+
}
60+
61+
object Supplementary {
62+
/** Returns a formatted alignment as per the SA tag: `(rname ,pos ,strand ,CIGAR ,mapQ ,NM ;)+` */
63+
def toString(rec: SamRecord): String = {
64+
val strand = if (rec.positiveStrand) '+' else '-'
65+
f"${rec.refName},${rec.start},${strand},${rec.cigar},${rec.mapq},${rec.getOrElse(SAMTag.NM.name(),0)}"
66+
}
67+
68+
69+
def apply(sa: String): Supplementary = {
70+
val parts = sa.split(",")
71+
Supplementary(parts(0), parts(1).toInt, parts(2) == "+", Cigar(parts(3)), parts(4).toInt, parts(5).toInt)
72+
}
73+
}
74+
4475
/**
4576
* Class that represents all reads from a template within a BAM file.
4677
*/
@@ -107,11 +138,21 @@ case class Template(r1: Option[SamRecord],
107138

108139
/** Fixes mate information and sets mate cigar on all primary and supplementary (but not secondary) records. */
109140
def fixMateInfo(): Unit = {
110-
for (primary <- r1; supp <- r2Supplementals) {
111-
SamPairUtil.setMateInformationOnSupplementalAlignment(supp.asSam, primary.asSam, true)
141+
// Set all mate info on BOTH secondary and supplementary records, not just supplementary records. We also need to
142+
// add the "pa" and "pm" tags with information about the primary alignments. Finally, we need the MQ tag!
143+
val r1NonPrimary = r1Supplementals ++ r1Secondaries
144+
val r2NonPrimary = r2Supplementals ++ r2Secondaries
145+
for (primary <- r1; nonPrimary <- r2NonPrimary) {
146+
SamPairUtil.setMateInformationOnSupplementalAlignment(nonPrimary.asSam, primary.asSam, true)
147+
nonPrimary(SAMTag.MQ.name()) = primary.mapq
148+
nonPrimary("mp") = Supplementary.toString(primary)
149+
r2.foreach(r => nonPrimary("rp") = Supplementary.toString(r))
112150
}
113-
for (primary <- r2; supp <- r1Supplementals) {
114-
SamPairUtil.setMateInformationOnSupplementalAlignment(supp.asSam, primary.asSam, true)
151+
for (primary <- r2; nonPrimary <- r1NonPrimary) {
152+
SamPairUtil.setMateInformationOnSupplementalAlignment(nonPrimary.asSam, primary.asSam, true)
153+
nonPrimary(SAMTag.MQ.name()) = primary.mapq
154+
nonPrimary("mp") = Supplementary.toString(primary)
155+
r1.foreach(r => nonPrimary("rp") = Supplementary.toString(r))
115156
}
116157
for (first <- r1; second <- r2) {
117158
SamPairUtil.setMateInfo(first.asSam, second.asSam, true)

src/main/scala/com/fulcrumgenomics/bam/api/SamOrder.scala

+47-18
Original file line numberDiff line numberDiff line change
@@ -24,12 +24,15 @@
2424

2525
package com.fulcrumgenomics.bam.api
2626

27+
import com.fulcrumgenomics.bam.{Bams, Supplementary}
2728
import com.fulcrumgenomics.umi.ConsensusTags
2829
import htsjdk.samtools.SAMFileHeader.{GroupOrder, SortOrder}
2930
import htsjdk.samtools.util.Murmur3
3031
import htsjdk.samtools.{SAMFileHeader, SAMUtils}
3132
import org.apache.commons.math3.genetics.RandomKey
3233

34+
import scala.reflect.runtime.universe.Template
35+
3336

3437
/** Trait for specifying BAM orderings. */
3538
sealed trait SamOrder extends Product {
@@ -175,24 +178,50 @@ object SamOrder {
175178
override val groupOrder: GroupOrder = GroupOrder.query
176179
override val subSort: Option[String] = Some("template-coordinate")
177180
override val sortkey: SamRecord => A = rec => {
178-
val readChrom = if (rec.unmapped) Int.MaxValue else rec.refIndex
179-
val mateChrom = if (rec.unpaired || rec.mateUnmapped) Int.MaxValue else rec.mateRefIndex
180-
val readNeg = rec.negativeStrand
181-
val mateNeg = if (rec.paired) rec.mateNegativeStrand else false
182-
val readPos = if (rec.unmapped) Int.MaxValue else if (readNeg) rec.unclippedEnd else rec.unclippedStart
183-
val matePos = if (rec.unpaired || rec.mateUnmapped) Int.MaxValue else if (mateNeg) SAMUtils.getMateUnclippedEnd(rec.asSam) else SAMUtils.getMateUnclippedStart(rec.asSam)
184-
val lib = Option(rec.readGroup).flatMap(rg => Option(rg.getLibrary)).getOrElse("Unknown")
185-
val mid = rec.get[String](ConsensusTags.MolecularId).map { m =>
186-
val index: Int = m.lastIndexOf('/')
187-
if (index >= 0) m.substring(0, index) else m
188-
}.getOrElse("")
189-
190-
if (readChrom < mateChrom || (readChrom == mateChrom && readPos < matePos) ||
191-
(readChrom == mateChrom && readPos == matePos && !readNeg)) {
192-
TemplateCoordinateKey(readChrom, mateChrom, readPos, matePos, readNeg, mateNeg, lib, mid, rec.name, false)
193-
}
194-
else {
195-
TemplateCoordinateKey(mateChrom, readChrom, matePos, readPos, mateNeg, readNeg, lib, mid, rec.name, true)
181+
// For non-secondary/non-supplementary alignments, use the info in the record. For secondary and supplementary
182+
// alignments, use the info in the pa/pm tags.
183+
if (!rec.secondary && !rec.supplementary) {
184+
val readChrom = if (rec.unmapped) Int.MaxValue else rec.refIndex
185+
val mateChrom = if (rec.unpaired || rec.mateUnmapped) Int.MaxValue else rec.mateRefIndex
186+
val readNeg = rec.negativeStrand
187+
val mateNeg = if (rec.paired) rec.mateNegativeStrand else false
188+
val readPos = if (rec.unmapped) Int.MaxValue else if (readNeg) rec.unclippedEnd else rec.unclippedStart
189+
val matePos = if (rec.unpaired || rec.mateUnmapped) Int.MaxValue else if (mateNeg) SAMUtils.getMateUnclippedEnd(rec.asSam) else SAMUtils.getMateUnclippedStart(rec.asSam)
190+
val lib = Option(rec.readGroup).flatMap(rg => Option(rg.getLibrary)).getOrElse("Unknown")
191+
val mid = rec.get[String](ConsensusTags.MolecularId).map { m =>
192+
val index: Int = m.lastIndexOf('/')
193+
if (index >= 0) m.substring(0, index) else m
194+
}.getOrElse("")
195+
196+
if (readChrom < mateChrom || (readChrom == mateChrom && readPos < matePos) ||
197+
(readChrom == mateChrom && readPos == matePos && !readNeg)) {
198+
TemplateCoordinateKey(readChrom, mateChrom, readPos, matePos, readNeg, mateNeg, lib, mid, rec.name, false)
199+
}
200+
else {
201+
TemplateCoordinateKey(mateChrom, readChrom, matePos, readPos, mateNeg, readNeg, lib, mid, rec.name, true)
202+
}
203+
} else {
204+
val primary = Supplementary(rec[String]("rp"))
205+
val mate = Supplementary(rec[String]("mp"))
206+
val readChrom = if (rec.unmapped) Int.MaxValue else primary.refIndex(rec.header)
207+
val mateChrom = if (rec.unpaired || rec.mateUnmapped) Int.MaxValue else mate.refIndex(rec.header)
208+
val readNeg = primary.negativeStrand
209+
val mateNeg = if (rec.paired) mate.negativeStrand else false
210+
val readPos = if (rec.unmapped) Int.MaxValue else if (readNeg) primary.unclippedEnd else primary.unclippedStart
211+
val matePos = if (rec.unpaired || rec.mateUnmapped) Int.MaxValue else if (mateNeg) mate.unclippedEnd else mate.unclippedStart
212+
val lib = Option(rec.readGroup).flatMap(rg => Option(rg.getLibrary)).getOrElse("Unknown")
213+
val mid = rec.get[String](ConsensusTags.MolecularId).map { m =>
214+
val index: Int = m.lastIndexOf('/')
215+
if (index >= 0) m.substring(0, index) else m
216+
}.getOrElse("")
217+
218+
if (readChrom < mateChrom || (readChrom == mateChrom && readPos < matePos) ||
219+
(readChrom == mateChrom && readPos == matePos && !readNeg)) {
220+
TemplateCoordinateKey(readChrom, mateChrom, readPos, matePos, readNeg, mateNeg, lib, mid, rec.name, false)
221+
}
222+
else {
223+
TemplateCoordinateKey(mateChrom, readChrom, matePos, readPos, mateNeg, readNeg, lib, mid, rec.name, true)
224+
}
196225
}
197226
}
198227
}

src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala

+1-1
Original file line numberDiff line numberDiff line change
@@ -719,7 +719,7 @@ class GroupReadsByUmi
719719

720720
// Then output the records in the right order (assigned tag, read name, r1, r2)
721721
templatesByMi.keys.toSeq.sortBy(id => (id.length, id)).foreach(tag => {
722-
templatesByMi(tag).sortBy(t => t.name).flatMap(t => t.primaryReads).foreach(rec => {
722+
templatesByMi(tag).sortBy(t => t.name).flatMap(t => t.allReads).foreach(rec => {
723723
out += rec
724724
})
725725
})

src/test/scala/com/fulcrumgenomics/bam/api/SamOrderTest.scala

+13-4
Original file line numberDiff line numberDiff line change
@@ -25,11 +25,13 @@
2525
package com.fulcrumgenomics.bam.api
2626

2727
import java.util.Random
28-
2928
import com.fulcrumgenomics.FgBioDef._
29+
import com.fulcrumgenomics.bam.Bams
30+
import com.fulcrumgenomics.bam.Bams.{MaxInMemory, templateIterator}
3031
import com.fulcrumgenomics.commons.util.SimpleCounter
3132
import com.fulcrumgenomics.testing.SamBuilder.{Minus, Plus}
3233
import com.fulcrumgenomics.testing.{SamBuilder, UnitSpec}
34+
import com.fulcrumgenomics.util.Io
3335
import htsjdk.samtools.{SAMFileHeader, SAMRecordCoordinateComparator, SAMRecordQueryNameComparator}
3436
import htsjdk.samtools.SAMFileHeader.{GroupOrder, SortOrder}
3537
import htsjdk.samtools.util.Murmur3
@@ -228,11 +230,18 @@ class SamOrderTest extends UnitSpec {
228230
s2.supplementary = true
229231
s2.properlyPaired = true
230232
exp += s2
231-
// primary read pairs for q1, that map to different contigs, but earlier that q1
233+
// primary read pairs for q2, that map to different contigs, but earlier that q1
232234
exp ++= builder.addPair("q2", contig = 1, contig2 = Some(2), start1 = 50, start2 = 30, cigar1 = "60M40S", cigar2 = "55M45S")
233235

234-
val expected = List("q1/2:sup", "q1/1", "q1/2", "q2/1", "q2/2")
235-
val actual = exp.sortBy(r => SamOrder.TemplateCoordinate.sortkey(r)).map(_.id)
236+
// Fix the mate information. Note: sorting here to get a template-iterator will write the records to disk first,
237+
// so we cannot use the records in builder/exp.
238+
val records = Bams.templateIterator(iterator=exp.iterator, header=builder.header, maxInMemory=MaxInMemory, tmpDir=Io.tmpDir).flatMap { template =>
239+
template.fixMateInfo()
240+
template.allReads
241+
}.toList
242+
243+
val expected = List("q2/1", "q2/2", "q1/1", "q1/2", "q1/2:sup")
244+
val actual = records.sortBy(r => SamOrder.TemplateCoordinate.sortkey(r)).map(_.id)
236245

237246
actual should contain theSameElementsInOrderAs expected
238247
}

src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala

+24-9
Original file line numberDiff line numberDiff line change
@@ -26,13 +26,14 @@
2626
*/
2727
package com.fulcrumgenomics.umi
2828

29-
import com.fulcrumgenomics.bam.Template
30-
import com.fulcrumgenomics.bam.api.SamOrder
29+
import com.fulcrumgenomics.bam.{Bams, Template}
30+
import com.fulcrumgenomics.bam.api.{SamOrder, SamWriter}
3131
import com.fulcrumgenomics.bam.api.SamOrder.TemplateCoordinate
3232
import com.fulcrumgenomics.cmdline.FgBioMain.FailureException
3333
import com.fulcrumgenomics.testing.SamBuilder.{Minus, Plus}
3434
import com.fulcrumgenomics.testing.{SamBuilder, UnitSpec}
3535
import com.fulcrumgenomics.umi.GroupReadsByUmi._
36+
import com.fulcrumgenomics.util.Io
3637
import org.scalatest.{OptionValues, PrivateMethodTester}
3738

3839
import java.nio.file.Files
@@ -335,29 +336,43 @@ class GroupReadsByUmiTest extends UnitSpec with OptionValues with PrivateMethodT
335336
}
336337

337338
it should "mark duplicates on supplementary reads" in {
338-
val builder = new SamBuilder(readLength = 100, sort = Some(SamOrder.TemplateCoordinate))
339+
val builder = new SamBuilder(readLength = 100, sort = Some(SamOrder.Coordinate))
339340
// primary read pairs for q1, that map to different contigs
340341
builder.addPair("q1", contig = 1, contig2 = Some(2), start1 = 66, start2 = 47, cigar1 = "60M40S", cigar2 = "55M45S", strand2 = Plus, attrs = Map("RX" -> "ACT"))
341342
// supplementary R2, which maps to the same chromosome as the primary R1
342343
val Seq(s1, s2) = builder.addPair("q1", contig = 1, contig2 = Some(1), start1 = 66, start2 = 66, cigar1 = "60M40S", strand2 = Minus, attrs = Map("RX" -> "ACT"))
343344
s1.supplementary = true
344345
s2.supplementary = true
345346
s2.properlyPaired = true
346-
// primary read pairs for q1, that map to different contigs, but earlier that q1
347+
// primary read pairs for q2, that map to different contigs, but earlier that q1
347348
builder.addPair("q2", contig = 1, contig2 = Some(2), start1 = 50, start2 = 30, cigar1 = "60M40S", cigar2 = "55M45S", attrs = Map("RX" -> "ACT"))
348349

349-
val in = builder.toTempFile()
350-
val out = Files.createTempFile("umi_grouped.", ".sam")
350+
// primary read pairs for q3, that are duplicates of q2
351+
builder.addPair("q3", contig = 1, contig2 = Some(2), start1 = 50, start2 = 30, cigar1 = "60M40S", cigar2 = "55M45S", attrs = Map("RX" -> "ACT"))
352+
353+
// Fix the mate information and write the input. Note: sorting here to get a template-iterator will write the
354+
// records to disk first, so we cannot use the records in builder and therefore must write to the input file
355+
// directly.
356+
val in = Files.createTempFile("raw_reads", ".sam")
357+
val writer = SamWriter(in, builder.header, sort = builder.sort)
358+
Bams.templateIterator(iterator = builder.iterator, header = builder.header, maxInMemory = Bams.MaxInMemory, tmpDir = Io.tmpDir).foreach { template =>
359+
template.fixMateInfo()
360+
writer ++= template.allReads
361+
}
362+
writer.close()
363+
364+
val out = Files.createTempFile("umi_grouped.", ".sam")
351365
val hist = Files.createTempFile("umi_grouped.", ".histogram.txt")
352-
val gr = new GroupReadsByUmi(input = in, output = out, familySizeHistogram = Some(hist), strategy = Strategy.Adjacency, edits = 1, markDuplicates = true, includeSupplementary = Some(true))
366+
val gr = new GroupReadsByUmi(input = in, output = out, familySizeHistogram = Some(hist), strategy = Strategy.Adjacency, edits = 1, markDuplicates = true)
353367

354368
gr.markDuplicates shouldBe true
355369
gr.execute()
356370

357371
val recs = readBamRecs(out)
358-
recs.length shouldBe 4
359-
recs.filter(_.name.equals("q1")).forall(_.duplicate == true) shouldBe true
372+
recs.length shouldBe 8
373+
recs.filter(_.name.equals("q1")).forall(_.duplicate == false) shouldBe true
360374
recs.filter(_.name.equals("q2")).forall(_.duplicate == false) shouldBe true
375+
recs.filter(_.name.equals("q3")).forall(_.duplicate == true) shouldBe true
361376
}
362377

363378
it should "correctly group reads with the paired assigner when the two UMIs are the same" in {

0 commit comments

Comments
 (0)