Skip to content

Commit 75ac4bd

Browse files
committed
fix: properly template-coordinate sort and duplicate mark secondary and supplementary reads
Secondary and supplementary reads must use the coordinates of the primary alignments within the template, otherwise they will not guaranteed to be next the primary alignments in the file. Therefore, we've added the "rp" and "mp" tags to store the SA-tag equivalent information for the primary alignment. This keeps information about the primary alignments with the secondary and supplementary alignments.
1 parent 8637898 commit 75ac4bd

File tree

5 files changed

+132
-38
lines changed

5 files changed

+132
-38
lines changed

src/main/scala/com/fulcrumgenomics/bam/Bams.scala

+47-6
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
package com.fulcrumgenomics.bam
2626

2727
import com.fulcrumgenomics.FgBioDef._
28+
import com.fulcrumgenomics.alignment.Cigar
2829
import com.fulcrumgenomics.bam.api.SamOrder.Queryname
2930
import com.fulcrumgenomics.bam.api._
3031
import com.fulcrumgenomics.commons.collection.{BetterBufferedIterator, SelfClosingIterator}
@@ -41,6 +42,36 @@ import htsjdk.samtools.util.{CloserUtil, CoordMath, Murmur3, SequenceUtil}
4142
import java.io.Closeable
4243
import scala.math.{max, min}
4344

45+
46+
47+
case class Supplementary(refName: String, start: Int, positiveStrand: Boolean, cigar: Cigar, mapq: Int, nm: Int) {
48+
def negativeStrand: Boolean = !positiveStrand
49+
def refIndex(header: SAMFileHeader): Int = header.getSequence(refName).getSequenceIndex
50+
51+
def end: Int = start + cigar.lengthOnTarget - 1
52+
def unclippedStart: Int = {
53+
SAMUtils.getUnclippedStart(start, cigar.toHtsjdkCigar)
54+
}
55+
56+
def unclippedEnd: Int = {
57+
SAMUtils.getUnclippedEnd(end, cigar.toHtsjdkCigar)
58+
}
59+
}
60+
61+
object Supplementary {
62+
/** Returns a formatted alignment as per the SA tag: `(rname ,pos ,strand ,CIGAR ,mapQ ,NM ;)+` */
63+
def toString(rec: SamRecord): String = {
64+
val strand = if (rec.positiveStrand) '+' else '-'
65+
f"${rec.refName},${rec.start},${strand},${rec.cigar},${rec.mapq},${rec.getOrElse(SAMTag.NM.name(),0)}"
66+
}
67+
68+
69+
def apply(sa: String): Supplementary = {
70+
val parts = sa.split(",")
71+
Supplementary(parts(0), parts(1).toInt, parts(2) == "+", Cigar(parts(3)), parts(4).toInt, parts(5).toInt)
72+
}
73+
}
74+
4475
/**
4576
* Class that represents all reads from a template within a BAM file.
4677
*/
@@ -108,13 +139,23 @@ case class Template(r1: Option[SamRecord],
108139
/** Fixes mate information and sets mate cigar and mate score on all primary and supplementary (but not secondary) records. */
109140
def fixMateInfo(): Unit = {
110141
// Developer note: the mate score ("ms") tag is used by samtools markdup
111-
for (primary <- r1; supp <- r2Supplementals) {
112-
SamPairUtil.setMateInformationOnSupplementalAlignment(supp.asSam, primary.asSam, true)
113-
primary.get[Int]("AS").foreach(supp("ms") = _)
142+
// Set all mate info on BOTH secondary and supplementary records, not just supplementary records. We also need to
143+
// add the "pa" and "pm" tags with information about the primary alignments. Finally, we need the MQ tag!
144+
val r1NonPrimary = r1Supplementals ++ r1Secondaries
145+
val r2NonPrimary = r2Supplementals ++ r2Secondaries
146+
for (primary <- r1; nonPrimary <- r2NonPrimary) {
147+
SamPairUtil.setMateInformationOnSupplementalAlignment(nonPrimary.asSam, primary.asSam, true)
148+
primary.get[Int]("AS").foreach(nonPrimary("ms") = _)
149+
nonPrimary(SAMTag.MQ.name()) = primary.mapq
150+
nonPrimary("mp") = Supplementary.toString(primary)
151+
r2.foreach(r => nonPrimary("rp") = Supplementary.toString(r))
114152
}
115-
for (primary <- r2; supp <- r1Supplementals) {
116-
SamPairUtil.setMateInformationOnSupplementalAlignment(supp.asSam, primary.asSam, true)
117-
primary.get[Int]("AS").foreach(supp("ms") = _)
153+
for (primary <- r2; nonPrimary <- r1NonPrimary) {
154+
SamPairUtil.setMateInformationOnSupplementalAlignment(nonPrimary.asSam, primary.asSam, true)
155+
primary.get[Int]("AS").foreach(nonPrimary("ms") = _)
156+
nonPrimary(SAMTag.MQ.name()) = primary.mapq
157+
nonPrimary("mp") = Supplementary.toString(primary)
158+
r1.foreach(r => nonPrimary("rp") = Supplementary.toString(r))
118159
}
119160
for (first <- r1; second <- r2) {
120161
SamPairUtil.setMateInfo(first.asSam, second.asSam, true)

src/main/scala/com/fulcrumgenomics/bam/api/SamOrder.scala

+47-18
Original file line numberDiff line numberDiff line change
@@ -24,12 +24,15 @@
2424

2525
package com.fulcrumgenomics.bam.api
2626

27+
import com.fulcrumgenomics.bam.{Bams, Supplementary}
2728
import com.fulcrumgenomics.umi.ConsensusTags
2829
import htsjdk.samtools.SAMFileHeader.{GroupOrder, SortOrder}
2930
import htsjdk.samtools.util.Murmur3
3031
import htsjdk.samtools.{SAMFileHeader, SAMUtils}
3132
import org.apache.commons.math3.genetics.RandomKey
3233

34+
import scala.reflect.runtime.universe.Template
35+
3336

3437
/** Trait for specifying BAM orderings. */
3538
sealed trait SamOrder extends Product {
@@ -175,24 +178,50 @@ object SamOrder {
175178
override val groupOrder: GroupOrder = GroupOrder.query
176179
override val subSort: Option[String] = Some("template-coordinate")
177180
override val sortkey: SamRecord => A = rec => {
178-
val readChrom = if (rec.unmapped) Int.MaxValue else rec.refIndex
179-
val mateChrom = if (rec.unpaired || rec.mateUnmapped) Int.MaxValue else rec.mateRefIndex
180-
val readNeg = rec.negativeStrand
181-
val mateNeg = if (rec.paired) rec.mateNegativeStrand else false
182-
val readPos = if (rec.unmapped) Int.MaxValue else if (readNeg) rec.unclippedEnd else rec.unclippedStart
183-
val matePos = if (rec.unpaired || rec.mateUnmapped) Int.MaxValue else if (mateNeg) SAMUtils.getMateUnclippedEnd(rec.asSam) else SAMUtils.getMateUnclippedStart(rec.asSam)
184-
val lib = Option(rec.readGroup).flatMap(rg => Option(rg.getLibrary)).getOrElse("Unknown")
185-
val mid = rec.get[String](ConsensusTags.MolecularId).map { m =>
186-
val index: Int = m.lastIndexOf('/')
187-
if (index >= 0) m.substring(0, index) else m
188-
}.getOrElse("")
189-
190-
if (readChrom < mateChrom || (readChrom == mateChrom && readPos < matePos) ||
191-
(readChrom == mateChrom && readPos == matePos && !readNeg)) {
192-
TemplateCoordinateKey(readChrom, mateChrom, readPos, matePos, readNeg, mateNeg, lib, mid, rec.name, false)
193-
}
194-
else {
195-
TemplateCoordinateKey(mateChrom, readChrom, matePos, readPos, mateNeg, readNeg, lib, mid, rec.name, true)
181+
// For non-secondary/non-supplementary alignments, use the info in the record. For secondary and supplementary
182+
// alignments, use the info in the pa/pm tags.
183+
if (!rec.secondary && !rec.supplementary) {
184+
val readChrom = if (rec.unmapped) Int.MaxValue else rec.refIndex
185+
val mateChrom = if (rec.unpaired || rec.mateUnmapped) Int.MaxValue else rec.mateRefIndex
186+
val readNeg = rec.negativeStrand
187+
val mateNeg = if (rec.paired) rec.mateNegativeStrand else false
188+
val readPos = if (rec.unmapped) Int.MaxValue else if (readNeg) rec.unclippedEnd else rec.unclippedStart
189+
val matePos = if (rec.unpaired || rec.mateUnmapped) Int.MaxValue else if (mateNeg) SAMUtils.getMateUnclippedEnd(rec.asSam) else SAMUtils.getMateUnclippedStart(rec.asSam)
190+
val lib = Option(rec.readGroup).flatMap(rg => Option(rg.getLibrary)).getOrElse("Unknown")
191+
val mid = rec.get[String](ConsensusTags.MolecularId).map { m =>
192+
val index: Int = m.lastIndexOf('/')
193+
if (index >= 0) m.substring(0, index) else m
194+
}.getOrElse("")
195+
196+
if (readChrom < mateChrom || (readChrom == mateChrom && readPos < matePos) ||
197+
(readChrom == mateChrom && readPos == matePos && !readNeg)) {
198+
TemplateCoordinateKey(readChrom, mateChrom, readPos, matePos, readNeg, mateNeg, lib, mid, rec.name, false)
199+
}
200+
else {
201+
TemplateCoordinateKey(mateChrom, readChrom, matePos, readPos, mateNeg, readNeg, lib, mid, rec.name, true)
202+
}
203+
} else {
204+
val primary = Supplementary(rec[String]("rp"))
205+
val mate = Supplementary(rec[String]("mp"))
206+
val readChrom = if (rec.unmapped) Int.MaxValue else primary.refIndex(rec.header)
207+
val mateChrom = if (rec.unpaired || rec.mateUnmapped) Int.MaxValue else mate.refIndex(rec.header)
208+
val readNeg = primary.negativeStrand
209+
val mateNeg = if (rec.paired) mate.negativeStrand else false
210+
val readPos = if (rec.unmapped) Int.MaxValue else if (readNeg) primary.unclippedEnd else primary.unclippedStart
211+
val matePos = if (rec.unpaired || rec.mateUnmapped) Int.MaxValue else if (mateNeg) mate.unclippedEnd else mate.unclippedStart
212+
val lib = Option(rec.readGroup).flatMap(rg => Option(rg.getLibrary)).getOrElse("Unknown")
213+
val mid = rec.get[String](ConsensusTags.MolecularId).map { m =>
214+
val index: Int = m.lastIndexOf('/')
215+
if (index >= 0) m.substring(0, index) else m
216+
}.getOrElse("")
217+
218+
if (readChrom < mateChrom || (readChrom == mateChrom && readPos < matePos) ||
219+
(readChrom == mateChrom && readPos == matePos && !readNeg)) {
220+
TemplateCoordinateKey(readChrom, mateChrom, readPos, matePos, readNeg, mateNeg, lib, mid, rec.name, false)
221+
}
222+
else {
223+
TemplateCoordinateKey(mateChrom, readChrom, matePos, readPos, mateNeg, readNeg, lib, mid, rec.name, true)
224+
}
196225
}
197226
}
198227
}

src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala

+1-1
Original file line numberDiff line numberDiff line change
@@ -734,7 +734,7 @@ class GroupReadsByUmi
734734

735735
// Then output the records in the right order (assigned tag, read name, r1, r2)
736736
templatesByMi.keys.toSeq.sortBy(id => (id.length, id)).foreach(tag => {
737-
templatesByMi(tag).sortBy(t => t.name).flatMap(t => t.primaryReads).foreach(rec => {
737+
templatesByMi(tag).sortBy(t => t.name).flatMap(t => t.allReads).foreach(rec => {
738738
out += rec
739739
})
740740
})

src/test/scala/com/fulcrumgenomics/bam/api/SamOrderTest.scala

+13-4
Original file line numberDiff line numberDiff line change
@@ -25,11 +25,13 @@
2525
package com.fulcrumgenomics.bam.api
2626

2727
import java.util.Random
28-
2928
import com.fulcrumgenomics.FgBioDef._
29+
import com.fulcrumgenomics.bam.Bams
30+
import com.fulcrumgenomics.bam.Bams.{MaxInMemory, templateIterator}
3031
import com.fulcrumgenomics.commons.util.SimpleCounter
3132
import com.fulcrumgenomics.testing.SamBuilder.{Minus, Plus}
3233
import com.fulcrumgenomics.testing.{SamBuilder, UnitSpec}
34+
import com.fulcrumgenomics.util.Io
3335
import htsjdk.samtools.{SAMFileHeader, SAMRecordCoordinateComparator, SAMRecordQueryNameComparator}
3436
import htsjdk.samtools.SAMFileHeader.{GroupOrder, SortOrder}
3537
import htsjdk.samtools.util.Murmur3
@@ -228,11 +230,18 @@ class SamOrderTest extends UnitSpec {
228230
s2.supplementary = true
229231
s2.properlyPaired = true
230232
exp += s2
231-
// primary read pairs for q1, that map to different contigs, but earlier that q1
233+
// primary read pairs for q2, that map to different contigs, but earlier that q1
232234
exp ++= builder.addPair("q2", contig = 1, contig2 = Some(2), start1 = 50, start2 = 30, cigar1 = "60M40S", cigar2 = "55M45S")
233235

234-
val expected = List("q1/2:sup", "q1/1", "q1/2", "q2/1", "q2/2")
235-
val actual = exp.sortBy(r => SamOrder.TemplateCoordinate.sortkey(r)).map(_.id)
236+
// Fix the mate information. Note: sorting here to get a template-iterator will write the records to disk first,
237+
// so we cannot use the records in builder/exp.
238+
val records = Bams.templateIterator(iterator=exp.iterator, header=builder.header, maxInMemory=MaxInMemory, tmpDir=Io.tmpDir).flatMap { template =>
239+
template.fixMateInfo()
240+
template.allReads
241+
}.toList
242+
243+
val expected = List("q2/1", "q2/2", "q1/1", "q1/2", "q1/2:sup")
244+
val actual = records.sortBy(r => SamOrder.TemplateCoordinate.sortkey(r)).map(_.id)
236245

237246
actual should contain theSameElementsInOrderAs expected
238247
}

src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala

+24-9
Original file line numberDiff line numberDiff line change
@@ -26,14 +26,15 @@
2626
*/
2727
package com.fulcrumgenomics.umi
2828

29-
import com.fulcrumgenomics.bam.Template
30-
import com.fulcrumgenomics.bam.api.SamOrder
29+
import com.fulcrumgenomics.bam.{Bams, Template}
30+
import com.fulcrumgenomics.bam.api.{SamOrder, SamWriter}
3131
import com.fulcrumgenomics.bam.api.SamOrder.TemplateCoordinate
3232
import com.fulcrumgenomics.cmdline.FgBioMain.FailureException
3333
import com.fulcrumgenomics.testing.SamBuilder.{Minus, Plus}
3434
import com.fulcrumgenomics.testing.{SamBuilder, UnitSpec}
3535
import com.fulcrumgenomics.umi.GroupReadsByUmi._
3636
import com.fulcrumgenomics.util.Metric
37+
import com.fulcrumgenomics.util.Io
3738
import org.scalatest.{OptionValues, PrivateMethodTester}
3839

3940
import java.nio.file.Files
@@ -345,29 +346,43 @@ class GroupReadsByUmiTest extends UnitSpec with OptionValues with PrivateMethodT
345346
}
346347

347348
it should "mark duplicates on supplementary reads" in {
348-
val builder = new SamBuilder(readLength = 100, sort = Some(SamOrder.TemplateCoordinate))
349+
val builder = new SamBuilder(readLength = 100, sort = Some(SamOrder.Coordinate))
349350
// primary read pairs for q1, that map to different contigs
350351
builder.addPair("q1", contig = 1, contig2 = Some(2), start1 = 66, start2 = 47, cigar1 = "60M40S", cigar2 = "55M45S", strand2 = Plus, attrs = Map("RX" -> "ACT"))
351352
// supplementary R2, which maps to the same chromosome as the primary R1
352353
val Seq(s1, s2) = builder.addPair("q1", contig = 1, contig2 = Some(1), start1 = 66, start2 = 66, cigar1 = "60M40S", strand2 = Minus, attrs = Map("RX" -> "ACT"))
353354
s1.supplementary = true
354355
s2.supplementary = true
355356
s2.properlyPaired = true
356-
// primary read pairs for q1, that map to different contigs, but earlier that q1
357+
// primary read pairs for q2, that map to different contigs, but earlier that q1
357358
builder.addPair("q2", contig = 1, contig2 = Some(2), start1 = 50, start2 = 30, cigar1 = "60M40S", cigar2 = "55M45S", attrs = Map("RX" -> "ACT"))
358359

359-
val in = builder.toTempFile()
360-
val out = Files.createTempFile("umi_grouped.", ".sam")
360+
// primary read pairs for q3, that are duplicates of q2
361+
builder.addPair("q3", contig = 1, contig2 = Some(2), start1 = 50, start2 = 30, cigar1 = "60M40S", cigar2 = "55M45S", attrs = Map("RX" -> "ACT"))
362+
363+
// Fix the mate information and write the input. Note: sorting here to get a template-iterator will write the
364+
// records to disk first, so we cannot use the records in builder and therefore must write to the input file
365+
// directly.
366+
val in = Files.createTempFile("raw_reads", ".sam")
367+
val writer = SamWriter(in, builder.header, sort = builder.sort)
368+
Bams.templateIterator(iterator = builder.iterator, header = builder.header, maxInMemory = Bams.MaxInMemory, tmpDir = Io.tmpDir).foreach { template =>
369+
template.fixMateInfo()
370+
writer ++= template.allReads
371+
}
372+
writer.close()
373+
374+
val out = Files.createTempFile("umi_grouped.", ".sam")
361375
val hist = Files.createTempFile("umi_grouped.", ".histogram.txt")
362-
val gr = new GroupReadsByUmi(input = in, output = out, familySizeHistogram = Some(hist), strategy = Strategy.Adjacency, edits = 1, markDuplicates = true, includeSupplementary = Some(true))
376+
val gr = new GroupReadsByUmi(input = in, output = out, familySizeHistogram = Some(hist), strategy = Strategy.Adjacency, edits = 1, markDuplicates = true)
363377

364378
gr.markDuplicates shouldBe true
365379
gr.execute()
366380

367381
val recs = readBamRecs(out)
368-
recs.length shouldBe 4
369-
recs.filter(_.name.equals("q1")).forall(_.duplicate == true) shouldBe true
382+
recs.length shouldBe 8
383+
recs.filter(_.name.equals("q1")).forall(_.duplicate == false) shouldBe true
370384
recs.filter(_.name.equals("q2")).forall(_.duplicate == false) shouldBe true
385+
recs.filter(_.name.equals("q3")).forall(_.duplicate == true) shouldBe true
371386
}
372387

373388
it should "correctly group reads with the paired assigner when the two UMIs are the same" in {

0 commit comments

Comments
 (0)