Skip to content

Commit bbfaeeb

Browse files
committed
review fixups
1 parent 1b5753c commit bbfaeeb

File tree

5 files changed

+127
-71
lines changed

5 files changed

+127
-71
lines changed

src/main/scala/com/fulcrumgenomics/alignment/Alignment.scala

+12-2
Original file line numberDiff line numberDiff line change
@@ -208,21 +208,31 @@ case class Cigar(elems: IndexedSeq[CigarElem]) extends Iterable[CigarElem] {
208208
def trailingSoftClippedBases: Int = stats.trailingSoftClippedBases
209209

210210
/** Returns the number of bases that are hard-clipped at the start of the sequence. */
211-
def leadingHardClippedBases = this.headOption.map { elem =>
211+
def leadingHardClippedBases: Int = this.headOption.map { elem =>
212212
if (elem.operator == CigarOperator.H) elem.length else 0
213213
}.getOrElse(0)
214214

215215
/** Returns the number of bases that are clipped at the start of the sequence. */
216216
def leadingClippedBases: Int = leadingHardClippedBases + leadingSoftClippedBases
217217

218218
/** Returns the number of bases that are hard-clipped at the end of the sequence. */
219-
def trailingHardClippedBases = this.lastOption.map { elem =>
219+
def trailingHardClippedBases: Int = this.lastOption.map { elem =>
220220
if (elem.operator == CigarOperator.H) elem.length else 0
221221
}.getOrElse(0)
222222

223223
/** Returns the number of bases that are clipped at the end of the sequence. */
224224
def trailingClippedBases: Int = trailingHardClippedBases + trailingSoftClippedBases
225225

226+
/** Returns the alignment start position adjusted for leading soft and hard clipped bases. */
227+
def unclippedStart(start: Int): Int = {
228+
start - this.iterator.takeWhile(_.operator.isClipping).map(_.length).sum
229+
}
230+
231+
/** Returns the alignment end position adjusted for trailing soft and hard clipped bases. */
232+
def unclippedEnd(end: Int): Int = {
233+
end + this.reverseIterator.takeWhile(_.operator.isClipping).map(_.length).sum
234+
}
235+
226236
/** Truncates the cigar based on either query or target length cutoff. */
227237
private def truncate(len: Int, shouldCount: CigarElem => Boolean): Cigar = {
228238
var pos = 1

src/main/scala/com/fulcrumgenomics/bam/Bams.scala

+49-24
Original file line numberDiff line numberDiff line change
@@ -43,33 +43,54 @@ import java.io.Closeable
4343
import scala.math.{max, min}
4444

4545

46-
47-
case class Supplementary(refName: String, start: Int, positiveStrand: Boolean, cigar: Cigar, mapq: Int, nm: Int) {
46+
/** Class to store information about an alignment, as described in the SAM SA tag. */
47+
private[bam] case class AlignmentInfo(refIndex: Int, start: Int, positiveStrand: Boolean, cigar: Option[Cigar], mapq: Int, nm: Int) {
4848
def negativeStrand: Boolean = !positiveStrand
49-
def refIndex(header: SAMFileHeader): Int = header.getSequence(refName).getSequenceIndex
50-
51-
def end: Int = start + cigar.lengthOnTarget - 1
52-
def unclippedStart: Int = {
53-
SAMUtils.getUnclippedStart(start, cigar.toHtsjdkCigar)
49+
def refName(header: SAMFileHeader): String = header.getSequence(refIndex).getSequenceName
50+
def mapped: Boolean = cigar.isDefined
51+
def unmapped: Boolean = !mapped
52+
private def _cigar: Cigar = cigar.getOrElse {
53+
throw new IllegalStateException(s"Cannot get cigar for AlignmentInfo: ${this}")
5454
}
55-
56-
def unclippedEnd: Int = {
57-
SAMUtils.getUnclippedEnd(end, cigar.toHtsjdkCigar)
55+
def end: Int = start + _cigar.lengthOnTarget - 1
56+
def unclippedStart: Int = _cigar.unclippedStart(start)
57+
def unclippedEnd: Int = _cigar.unclippedEnd(end)
58+
/** Returns a formatted alignment as per the SA tag: `(rname ,pos ,strand ,CIGAR ,mapQ ,NM ;)+` */
59+
def toSA(header: SAMFileHeader): String = {
60+
val strand = if (positiveStrand) '+' else '-'
61+
val refName = header.getSequence(refIndex).getSequenceName
62+
val cigar = this.cigar.getOrElse("*")
63+
f"${refName},${start},${strand},${cigar},${mapq},${nm}"
5864
}
5965
}
6066

61-
object Supplementary {
62-
/** Returns a formatted alignment as per the SA tag: `(rname ,pos ,strand ,CIGAR ,mapQ ,NM ;)+` */
63-
def toString(rec: SamRecord): String = {
64-
val strand = if (rec.positiveStrand) '+' else '-'
65-
f"${rec.refName},${rec.start},${strand},${rec.cigar},${rec.mapq},${rec.getOrElse(SAMTag.NM.name(),0)}"
67+
private[bam] object AlignmentInfo {
68+
def apply(rec: SamRecord, mate: Boolean = false): AlignmentInfo = {
69+
if (mate) {
70+
val mateRefIndex = if (rec.unpaired || rec.mateUnmapped) Int.MaxValue else rec.mateRefIndex
71+
val mateCigar = if (rec.unpaired || rec.mateUnmapped) None else Some(rec.mateCigar.getOrElse {
72+
throw new IllegalStateException(s"Mate CIGAR (Tag 'MC') not found for $rec, consider using SetMateInformation.")
73+
})
74+
// NB: mateCigar has already checked for the existence of the MC tag, so using .get here is fine
75+
val mateStart = if (rec.unpaired || rec.mateUnmapped) Int.MaxValue else rec.mateStart
76+
val mateStrand = if (rec.unpaired || rec.mateUnmapped) true else rec.matePositiveStrand
77+
AlignmentInfo(mateRefIndex, mateStart, mateStrand, mateCigar, rec.mapq, 0)
78+
} else {
79+
val refIndex = if (rec.unmapped) Int.MaxValue else rec.refIndex
80+
val positiveStrand = rec.positiveStrand
81+
val start = if (rec.unmapped) Int.MaxValue else rec.start
82+
AlignmentInfo(refIndex, start, positiveStrand, Some(rec.cigar), rec.mapq, rec.getOrElse(SAMTag.NM.name(), 0))
83+
}
6684
}
6785

68-
69-
def apply(sa: String): Supplementary = {
70-
val parts = sa.split(",")
71-
Supplementary(parts(0), parts(1).toInt, parts(2) == "+", Cigar(parts(3)), parts(4).toInt, parts(5).toInt)
86+
def apply(sa: String, header: SAMFileHeader): AlignmentInfo = {
87+
val parts = sa.split(",")
88+
require(parts.length == 6, f"Could not parse SA tag: ${sa}")
89+
val refIndex = header.getSequenceIndex(parts(0))
90+
AlignmentInfo(refIndex, parts(1).toInt, parts(2) == "+", Some(Cigar(parts(3))), parts(4).toInt, parts(5).toInt)
7291
}
92+
/** Returns a formatted alignment as per the SA tag: `(rname ,pos ,strand ,CIGAR ,mapQ ,NM ;)+` */
93+
def toSA(rec: SamRecord): String = AlignmentInfo(rec).toSA(rec.header)
7394
}
7495

7596
/**
@@ -136,7 +157,7 @@ case class Template(r1: Option[SamRecord],
136157
Template(x1, x2)
137158
}
138159

139-
/** Fixes mate information and sets mate cigar on all primary and supplementary (but not secondary) records. */
160+
/** Fixes mate information and sets mate cigar on all primary, secondary, and supplementary records. */
140161
def fixMateInfo(): Unit = {
141162
// Set all mate info on BOTH secondary and supplementary records, not just supplementary records. We also need to
142163
// add the "pa" and "pm" tags with information about the primary alignments. Finally, we need the MQ tag!
@@ -145,14 +166,14 @@ case class Template(r1: Option[SamRecord],
145166
for (primary <- r1; nonPrimary <- r2NonPrimary) {
146167
SamPairUtil.setMateInformationOnSupplementalAlignment(nonPrimary.asSam, primary.asSam, true)
147168
nonPrimary(SAMTag.MQ.name()) = primary.mapq
148-
nonPrimary("mp") = Supplementary.toString(primary)
149-
r2.foreach(r => nonPrimary("rp") = Supplementary.toString(r))
169+
nonPrimary(Template.MatePrimarySamTag) = AlignmentInfo.toSA(primary)
170+
r2.foreach(r => nonPrimary(Template.ReadPrimarySamTag) = AlignmentInfo.toSA(r))
150171
}
151172
for (primary <- r2; nonPrimary <- r1NonPrimary) {
152173
SamPairUtil.setMateInformationOnSupplementalAlignment(nonPrimary.asSam, primary.asSam, true)
153174
nonPrimary(SAMTag.MQ.name()) = primary.mapq
154-
nonPrimary("mp") = Supplementary.toString(primary)
155-
r1.foreach(r => nonPrimary("rp") = Supplementary.toString(r))
175+
nonPrimary(Template.MatePrimarySamTag) = AlignmentInfo.toSA(primary)
176+
r1.foreach(r => nonPrimary(Template.ReadPrimarySamTag) = AlignmentInfo.toSA(r))
156177
}
157178
for (first <- r1; second <- r2) {
158179
SamPairUtil.setMateInfo(first.asSam, second.asSam, true)
@@ -164,6 +185,10 @@ case class Template(r1: Option[SamRecord],
164185
}
165186

166187
object Template {
188+
/** The local SAM tag to store the alignment information of the primary alignment (in the same format as the SA tag) */
189+
val ReadPrimarySamTag: String = "rp"
190+
/** The local SAM tag to store the alignment information of the mate's primary alignment (in the same format as the SA tag) */
191+
val MatePrimarySamTag: String = "mp"
167192
/**
168193
* Generates a Template for the next template in the buffered iterator. Assumes that the
169194
* iterator is queryname sorted or grouped.

src/main/scala/com/fulcrumgenomics/bam/api/SamOrder.scala

+30-44
Original file line numberDiff line numberDiff line change
@@ -24,15 +24,13 @@
2424

2525
package com.fulcrumgenomics.bam.api
2626

27-
import com.fulcrumgenomics.bam.{Bams, Supplementary}
27+
import com.fulcrumgenomics.bam.{Bams, AlignmentInfo, Template}
2828
import com.fulcrumgenomics.umi.ConsensusTags
2929
import htsjdk.samtools.SAMFileHeader.{GroupOrder, SortOrder}
3030
import htsjdk.samtools.util.Murmur3
3131
import htsjdk.samtools.{SAMFileHeader, SAMUtils}
3232
import org.apache.commons.math3.genetics.RandomKey
3333

34-
import scala.reflect.runtime.universe.Template
35-
3634

3735
/** Trait for specifying BAM orderings. */
3836
sealed trait SamOrder extends Product {
@@ -178,50 +176,38 @@ object SamOrder {
178176
override val groupOrder: GroupOrder = GroupOrder.query
179177
override val subSort: Option[String] = Some("template-coordinate")
180178
override val sortkey: SamRecord => A = rec => {
181-
// For non-secondary/non-supplementary alignments, use the info in the record. For secondary and supplementary
182-
// alignments, use the info in the pa/pm tags.
183-
if (!rec.secondary && !rec.supplementary) {
184-
val readChrom = if (rec.unmapped) Int.MaxValue else rec.refIndex
185-
val mateChrom = if (rec.unpaired || rec.mateUnmapped) Int.MaxValue else rec.mateRefIndex
186-
val readNeg = rec.negativeStrand
187-
val mateNeg = if (rec.paired) rec.mateNegativeStrand else false
188-
val readPos = if (rec.unmapped) Int.MaxValue else if (readNeg) rec.unclippedEnd else rec.unclippedStart
189-
val matePos = if (rec.unpaired || rec.mateUnmapped) Int.MaxValue else if (mateNeg) SAMUtils.getMateUnclippedEnd(rec.asSam) else SAMUtils.getMateUnclippedStart(rec.asSam)
190-
val lib = Option(rec.readGroup).flatMap(rg => Option(rg.getLibrary)).getOrElse("Unknown")
191-
val mid = rec.get[String](ConsensusTags.MolecularId).map { m =>
192-
val index: Int = m.lastIndexOf('/')
193-
if (index >= 0) m.substring(0, index) else m
194-
}.getOrElse("")
195-
196-
if (readChrom < mateChrom || (readChrom == mateChrom && readPos < matePos) ||
197-
(readChrom == mateChrom && readPos == matePos && !readNeg)) {
198-
TemplateCoordinateKey(readChrom, mateChrom, readPos, matePos, readNeg, mateNeg, lib, mid, rec.name, false)
179+
// For non-secondary/non-supplementary alignments, or unpaired or mate unmapped reads, use the info in the record.
180+
// Otherwise, use the info in the pa/pm tags.
181+
val primary = if (!rec.secondary && !rec.supplementary) AlignmentInfo(rec) else {
182+
val readPrimaryTag = rec.get[String](Template.ReadPrimarySamTag).getOrElse {
183+
throw new IllegalStateException(
184+
s"Missing '${Template.ReadPrimarySamTag}' tag required for TemplateCoordinate; try using ZipperBams or SetMateInformation"
185+
)
199186
}
200-
else {
201-
TemplateCoordinateKey(mateChrom, readChrom, matePos, readPos, mateNeg, readNeg, lib, mid, rec.name, true)
187+
AlignmentInfo(readPrimaryTag, rec.header)
188+
}
189+
val mate = if ((!rec.secondary && !rec.supplementary) || rec.unpaired || rec.mateUnmapped) AlignmentInfo(rec, mate=true) else {
190+
val matePrimaryTag = rec.get[String](Template.MatePrimarySamTag).getOrElse {
191+
throw new IllegalStateException(
192+
s"Missing '${Template.MatePrimarySamTag}' tag required for TemplateCoordinate; try using ZipperBams or SetMateInformation"
193+
)
202194
}
203-
} else {
204-
val primary = Supplementary(rec[String]("rp"))
205-
val mate = Supplementary(rec[String]("mp"))
206-
val readChrom = if (rec.unmapped) Int.MaxValue else primary.refIndex(rec.header)
207-
val mateChrom = if (rec.unpaired || rec.mateUnmapped) Int.MaxValue else mate.refIndex(rec.header)
208-
val readNeg = primary.negativeStrand
209-
val mateNeg = if (rec.paired) mate.negativeStrand else false
210-
val readPos = if (rec.unmapped) Int.MaxValue else if (readNeg) primary.unclippedEnd else primary.unclippedStart
211-
val matePos = if (rec.unpaired || rec.mateUnmapped) Int.MaxValue else if (mateNeg) mate.unclippedEnd else mate.unclippedStart
212-
val lib = Option(rec.readGroup).flatMap(rg => Option(rg.getLibrary)).getOrElse("Unknown")
213-
val mid = rec.get[String](ConsensusTags.MolecularId).map { m =>
214-
val index: Int = m.lastIndexOf('/')
215-
if (index >= 0) m.substring(0, index) else m
216-
}.getOrElse("")
195+
AlignmentInfo(matePrimaryTag, rec.header)
196+
}
197+
val readPos = if (primary.unmapped) Int.MaxValue else if (primary.negativeStrand) primary.unclippedEnd else primary.unclippedStart
198+
val matePos = if (mate.unmapped) Int.MaxValue else if (mate.negativeStrand) mate.unclippedEnd else mate.unclippedStart
199+
val lib = Option(rec.readGroup).flatMap(rg => Option(rg.getLibrary)).getOrElse("Unknown")
200+
val mid = rec.get[String](ConsensusTags.MolecularId).map { m =>
201+
val index: Int = m.lastIndexOf('/')
202+
if (index >= 0) m.substring(0, index) else m
203+
}.getOrElse("")
217204

218-
if (readChrom < mateChrom || (readChrom == mateChrom && readPos < matePos) ||
219-
(readChrom == mateChrom && readPos == matePos && !readNeg)) {
220-
TemplateCoordinateKey(readChrom, mateChrom, readPos, matePos, readNeg, mateNeg, lib, mid, rec.name, false)
221-
}
222-
else {
223-
TemplateCoordinateKey(mateChrom, readChrom, matePos, readPos, mateNeg, readNeg, lib, mid, rec.name, true)
224-
}
205+
if (primary.refIndex < mate.refIndex || (primary.refIndex == mate.refIndex && readPos < matePos) ||
206+
(primary.refIndex == mate.refIndex && readPos == matePos && primary.positiveStrand)) {
207+
TemplateCoordinateKey(primary.refIndex, mate.refIndex, readPos, matePos, primary.negativeStrand, mate.negativeStrand, lib, mid, rec.name, false)
208+
}
209+
else {
210+
TemplateCoordinateKey(mate.refIndex, primary.refIndex, matePos, readPos, mate.negativeStrand, primary.negativeStrand, lib, mid, rec.name, true)
225211
}
226212
}
227213
}

src/main/scala/com/fulcrumgenomics/bam/api/SamRecord.scala

+1-1
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ class TransientAttrs(private val rec: SamRecord) {
7676

7777
/**
7878
* A trait that fgbio uses as a replacement for [[SAMRecord]]. The trait is self-typed as a
79-
* [[SamRecordIntermediate]] which is a sub-class of SAMRecord. It is done this wasy so that
79+
* [[SamRecordIntermediate]] which is a sub-class of SAMRecord. It is done this ways so that
8080
* a) we can access superclass methods via [[SamRecordIntermediate]] but that self-typing here
8181
* instead of extending hides the [[SAMRecord]] API from users of the class. The result is
8282
* always a [[SAMRecord]] but isn't seen as such without casting.

src/test/scala/com/fulcrumgenomics/alignment/AlignmentTest.scala

+35
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,41 @@ class AlignmentTest extends UnitSpec {
121121
Cigar("10M2D20M").reverse.toString shouldBe "20M2D10M"
122122
}
123123

124+
private case class ClippingTestCase
125+
(cigar: String, clippedBases: Int,
126+
leadingSoftClippedBases: Int, trailingSoftClippedBases: Int,
127+
leadingHardClippedBases: Int, trailingHardClippedBases: Int,
128+
leadingClippedBases: Int, trailingClippedBases: Int,
129+
unclippedStart: Int, unclippedEnd: Int)
130+
131+
Seq(
132+
ClippingTestCase("50M", 0, 0, 0, 0, 0, 0, 0, 0, 0), // no clipping
133+
ClippingTestCase("20S50M", 20, 20, 0, 0, 0, 20, 0, -20, 0), // leading soft clipping
134+
ClippingTestCase("50M10S", 10, 0, 10, 0, 0, 0, 10, 0, 10), // trailing soft clipping
135+
ClippingTestCase("20S50M10S", 30, 20, 10, 0, 0, 20, 10, -20, 10), // leading and trailing soft clipping
136+
ClippingTestCase("20H50M", 20, 0, 0, 20, 0, 20, 0, -20, 0), // leading hard clipping
137+
ClippingTestCase("50M10H", 10, 0, 0, 0, 10, 0, 10, 0, 10), // trailing hard clipping
138+
ClippingTestCase("20H50M10H", 30, 0, 0, 20, 10, 20, 10, -20, 10), // leading and trailing hard clipping
139+
ClippingTestCase("15H5S50M", 20, 5, 0, 15, 0, 20, 0, -20, 0), // leading hard-the-soft clipping
140+
ClippingTestCase("50M5S15H", 20, 0, 5, 0, 15, 0, 20, 0, 20), // trailing hard-the-soft clipping
141+
ClippingTestCase("20H15S50M10S5H", 50, 15, 10, 20, 5, 35, 15, -35, 15), // leading and trailing hard-the-soft clipping
142+
).foreach { testCase =>
143+
"Cigar clipping methods" should s"account for leading and trailing clipping in ${testCase.cigar}" in {
144+
val cigar = Cigar(testCase.cigar)
145+
cigar.clippedBases shouldBe testCase.clippedBases
146+
cigar.leadingSoftClippedBases shouldBe testCase.leadingSoftClippedBases
147+
cigar.trailingSoftClippedBases shouldBe testCase.trailingSoftClippedBases
148+
cigar.leadingHardClippedBases shouldBe testCase.leadingHardClippedBases
149+
cigar.trailingHardClippedBases shouldBe testCase.trailingHardClippedBases
150+
cigar.leadingClippedBases shouldBe testCase.leadingClippedBases
151+
cigar.trailingClippedBases shouldBe testCase.trailingClippedBases
152+
cigar.unclippedStart(100) shouldBe testCase.unclippedStart + 100
153+
cigar.unclippedEnd(200) shouldBe testCase.unclippedEnd + 200
154+
}
155+
156+
}
157+
158+
124159
/////////////////////////////////////////////////////////////////////////////
125160
// Tests for the Alignment class
126161
/////////////////////////////////////////////////////////////////////////////

0 commit comments

Comments
 (0)