Skip to content

Commit 36c78a6

Browse files
committed
review fixups
1 parent 1b5753c commit 36c78a6

File tree

5 files changed

+121
-69
lines changed

5 files changed

+121
-69
lines changed

src/main/scala/com/fulcrumgenomics/alignment/Alignment.scala

+12-2
Original file line numberDiff line numberDiff line change
@@ -208,21 +208,31 @@ case class Cigar(elems: IndexedSeq[CigarElem]) extends Iterable[CigarElem] {
208208
def trailingSoftClippedBases: Int = stats.trailingSoftClippedBases
209209

210210
/** Returns the number of bases that are hard-clipped at the start of the sequence. */
211-
def leadingHardClippedBases = this.headOption.map { elem =>
211+
def leadingHardClippedBases: Int = this.headOption.map { elem =>
212212
if (elem.operator == CigarOperator.H) elem.length else 0
213213
}.getOrElse(0)
214214

215215
/** Returns the number of bases that are clipped at the start of the sequence. */
216216
def leadingClippedBases: Int = leadingHardClippedBases + leadingSoftClippedBases
217217

218218
/** Returns the number of bases that are hard-clipped at the end of the sequence. */
219-
def trailingHardClippedBases = this.lastOption.map { elem =>
219+
def trailingHardClippedBases: Int = this.lastOption.map { elem =>
220220
if (elem.operator == CigarOperator.H) elem.length else 0
221221
}.getOrElse(0)
222222

223223
/** Returns the number of bases that are clipped at the end of the sequence. */
224224
def trailingClippedBases: Int = trailingHardClippedBases + trailingSoftClippedBases
225225

226+
/** Returns the alignment start position adjusted for leading soft and hard clipped bases. */
227+
def unclippedStart(start: Int): Int = {
228+
start - this.iterator.takeWhile(_.operator.isClipping).map(_.length).sum
229+
}
230+
231+
/** Returns the alignment end position adjusted for trailing soft and hard clipped bases. */
232+
def unclippedEnd(end: Int): Int = {
233+
end + this.reverseIterator.takeWhile(_.operator.isClipping).map(_.length).sum
234+
}
235+
226236
/** Truncates the cigar based on either query or target length cutoff. */
227237
private def truncate(len: Int, shouldCount: CigarElem => Boolean): Cigar = {
228238
var pos = 1

src/main/scala/com/fulcrumgenomics/bam/Bams.scala

+47-24
Original file line numberDiff line numberDiff line change
@@ -43,33 +43,52 @@ import java.io.Closeable
4343
import scala.math.{max, min}
4444

4545

46-
47-
case class Supplementary(refName: String, start: Int, positiveStrand: Boolean, cigar: Cigar, mapq: Int, nm: Int) {
46+
/** Class to store information about an alignment, as described in the SAM SA tag. */
47+
private[bam] case class AlignmentInfo(refIndex: Int, start: Int, positiveStrand: Boolean, cigar: Option[Cigar], mapq: Int, nm: Int) {
4848
def negativeStrand: Boolean = !positiveStrand
49-
def refIndex(header: SAMFileHeader): Int = header.getSequence(refName).getSequenceIndex
50-
51-
def end: Int = start + cigar.lengthOnTarget - 1
52-
def unclippedStart: Int = {
53-
SAMUtils.getUnclippedStart(start, cigar.toHtsjdkCigar)
49+
def refName(header: SAMFileHeader): String = header.getSequence(refIndex).getSequenceName
50+
private def _cigar: Cigar = cigar.getOrElse {
51+
throw new IllegalStateException(s"Cannot get cigar for AlignmentInfo: ${this}")
5452
}
55-
56-
def unclippedEnd: Int = {
57-
SAMUtils.getUnclippedEnd(end, cigar.toHtsjdkCigar)
53+
def end: Int = start + _cigar.lengthOnTarget - 1
54+
def unclippedStart: Int = _cigar.unclippedStart(start)
55+
def unclippedEnd: Int = _cigar.unclippedEnd(end)
56+
/** Returns a formatted alignment as per the SA tag: `(rname ,pos ,strand ,CIGAR ,mapQ ,NM ;)+` */
57+
def toSA(header: SAMFileHeader): String = {
58+
val strand = if (positiveStrand) '+' else '-'
59+
val refName = header.getSequence(refIndex).getSequenceName
60+
val cigar = this.cigar.getOrElse("*")
61+
f"${refName},${start},${strand},${cigar},${mapq},${nm}"
5862
}
5963
}
6064

61-
object Supplementary {
62-
/** Returns a formatted alignment as per the SA tag: `(rname ,pos ,strand ,CIGAR ,mapQ ,NM ;)+` */
63-
def toString(rec: SamRecord): String = {
64-
val strand = if (rec.positiveStrand) '+' else '-'
65-
f"${rec.refName},${rec.start},${strand},${rec.cigar},${rec.mapq},${rec.getOrElse(SAMTag.NM.name(),0)}"
65+
private[bam] object AlignmentInfo {
66+
def apply(rec: SamRecord, mate: Boolean = false): AlignmentInfo = {
67+
if (mate) {
68+
val mateRefIndex = if (rec.unpaired || rec.mateUnmapped) Int.MaxValue else rec.mateRefIndex
69+
val mateCigar = if (rec.unpaired || rec.mateUnmapped) None else Some(rec.mateCigar.getOrElse {
70+
throw new IllegalStateException(s"Mate CIGAR (Tag 'MC') not found for $rec, consider using SetMateInformation.")
71+
})
72+
// NB: mateCigar has already checked for the existence of the MC tag, so using .get here is fine
73+
val mateStart = if (rec.unpaired || rec.mateUnmapped) Int.MaxValue else if (rec.mateNegativeStrand) rec.mateUnclippedEnd.get else rec.mateUnclippedStart.get
74+
val mateStrand = if (rec.unpaired || rec.mateUnmapped) true else rec.matePositiveStrand
75+
AlignmentInfo(mateRefIndex, mateStart, mateStrand, mateCigar, rec.mapq, 0)
76+
} else {
77+
val refIndex = if (rec.unmapped) Int.MaxValue else rec.refIndex
78+
val positiveStrand = rec.positiveStrand
79+
val start = if (rec.unmapped) Int.MaxValue else if (rec.negativeStrand) rec.unclippedEnd else rec.unclippedStart
80+
AlignmentInfo(refIndex, start, positiveStrand, Some(rec.cigar), rec.mapq, rec.getOrElse(SAMTag.NM.name(), 0))
81+
}
6682
}
6783

68-
69-
def apply(sa: String): Supplementary = {
70-
val parts = sa.split(",")
71-
Supplementary(parts(0), parts(1).toInt, parts(2) == "+", Cigar(parts(3)), parts(4).toInt, parts(5).toInt)
84+
def apply(sa: String, header: SAMFileHeader): AlignmentInfo = {
85+
val parts = sa.split(",")
86+
require(parts.length == 6, f"Could not parse SA tag: ${sa}")
87+
val refIndex = header.getSequenceIndex(parts(0))
88+
AlignmentInfo(refIndex, parts(1).toInt, parts(2) == "+", Some(Cigar(parts(3))), parts(4).toInt, parts(5).toInt)
7289
}
90+
/** Returns a formatted alignment as per the SA tag: `(rname ,pos ,strand ,CIGAR ,mapQ ,NM ;)+` */
91+
def toSA(rec: SamRecord): String = AlignmentInfo(rec).toSA(rec.header)
7392
}
7493

7594
/**
@@ -136,7 +155,7 @@ case class Template(r1: Option[SamRecord],
136155
Template(x1, x2)
137156
}
138157

139-
/** Fixes mate information and sets mate cigar on all primary and supplementary (but not secondary) records. */
158+
/** Fixes mate information and sets mate cigar on all primary, secondary, and supplementary records. */
140159
def fixMateInfo(): Unit = {
141160
// Set all mate info on BOTH secondary and supplementary records, not just supplementary records. We also need to
142161
// add the "pa" and "pm" tags with information about the primary alignments. Finally, we need the MQ tag!
@@ -145,14 +164,14 @@ case class Template(r1: Option[SamRecord],
145164
for (primary <- r1; nonPrimary <- r2NonPrimary) {
146165
SamPairUtil.setMateInformationOnSupplementalAlignment(nonPrimary.asSam, primary.asSam, true)
147166
nonPrimary(SAMTag.MQ.name()) = primary.mapq
148-
nonPrimary("mp") = Supplementary.toString(primary)
149-
r2.foreach(r => nonPrimary("rp") = Supplementary.toString(r))
167+
nonPrimary(Template.MatePrimarySamTag) = AlignmentInfo.toSA(primary)
168+
r2.foreach(r => nonPrimary(Template.ReadPrimarySamTag) = AlignmentInfo.toSA(r))
150169
}
151170
for (primary <- r2; nonPrimary <- r1NonPrimary) {
152171
SamPairUtil.setMateInformationOnSupplementalAlignment(nonPrimary.asSam, primary.asSam, true)
153172
nonPrimary(SAMTag.MQ.name()) = primary.mapq
154-
nonPrimary("mp") = Supplementary.toString(primary)
155-
r1.foreach(r => nonPrimary("rp") = Supplementary.toString(r))
173+
nonPrimary(Template.MatePrimarySamTag) = AlignmentInfo.toSA(primary)
174+
r1.foreach(r => nonPrimary(Template.ReadPrimarySamTag) = AlignmentInfo.toSA(r))
156175
}
157176
for (first <- r1; second <- r2) {
158177
SamPairUtil.setMateInfo(first.asSam, second.asSam, true)
@@ -164,6 +183,10 @@ case class Template(r1: Option[SamRecord],
164183
}
165184

166185
object Template {
186+
/** The local SAM tag to store the alignment information of the primary alignment (in the same format as the SA tag) */
187+
val ReadPrimarySamTag: String = "rp"
188+
/** The local SAM tag to store the alignment information of the mate's primary alignment (in the same format as the SA tag) */
189+
val MatePrimarySamTag: String = "mp"
167190
/**
168191
* Generates a Template for the next template in the buffered iterator. Assumes that the
169192
* iterator is queryname sorted or grouped.

src/main/scala/com/fulcrumgenomics/bam/api/SamOrder.scala

+26-42
Original file line numberDiff line numberDiff line change
@@ -24,15 +24,13 @@
2424

2525
package com.fulcrumgenomics.bam.api
2626

27-
import com.fulcrumgenomics.bam.{Bams, Supplementary}
27+
import com.fulcrumgenomics.bam.{Bams, AlignmentInfo, Template}
2828
import com.fulcrumgenomics.umi.ConsensusTags
2929
import htsjdk.samtools.SAMFileHeader.{GroupOrder, SortOrder}
3030
import htsjdk.samtools.util.Murmur3
3131
import htsjdk.samtools.{SAMFileHeader, SAMUtils}
3232
import org.apache.commons.math3.genetics.RandomKey
3333

34-
import scala.reflect.runtime.universe.Template
35-
3634

3735
/** Trait for specifying BAM orderings. */
3836
sealed trait SamOrder extends Product {
@@ -180,48 +178,34 @@ object SamOrder {
180178
override val sortkey: SamRecord => A = rec => {
181179
// For non-secondary/non-supplementary alignments, use the info in the record. For secondary and supplementary
182180
// alignments, use the info in the pa/pm tags.
183-
if (!rec.secondary && !rec.supplementary) {
184-
val readChrom = if (rec.unmapped) Int.MaxValue else rec.refIndex
185-
val mateChrom = if (rec.unpaired || rec.mateUnmapped) Int.MaxValue else rec.mateRefIndex
186-
val readNeg = rec.negativeStrand
187-
val mateNeg = if (rec.paired) rec.mateNegativeStrand else false
188-
val readPos = if (rec.unmapped) Int.MaxValue else if (readNeg) rec.unclippedEnd else rec.unclippedStart
189-
val matePos = if (rec.unpaired || rec.mateUnmapped) Int.MaxValue else if (mateNeg) SAMUtils.getMateUnclippedEnd(rec.asSam) else SAMUtils.getMateUnclippedStart(rec.asSam)
190-
val lib = Option(rec.readGroup).flatMap(rg => Option(rg.getLibrary)).getOrElse("Unknown")
191-
val mid = rec.get[String](ConsensusTags.MolecularId).map { m =>
192-
val index: Int = m.lastIndexOf('/')
193-
if (index >= 0) m.substring(0, index) else m
194-
}.getOrElse("")
195-
196-
if (readChrom < mateChrom || (readChrom == mateChrom && readPos < matePos) ||
197-
(readChrom == mateChrom && readPos == matePos && !readNeg)) {
198-
TemplateCoordinateKey(readChrom, mateChrom, readPos, matePos, readNeg, mateNeg, lib, mid, rec.name, false)
199-
}
181+
val (primary, mate) = {
182+
if (!rec.secondary && !rec.supplementary) (AlignmentInfo(rec), AlignmentInfo(rec, mate=true))
200183
else {
201-
TemplateCoordinateKey(mateChrom, readChrom, matePos, readPos, mateNeg, readNeg, lib, mid, rec.name, true)
184+
val readPrimaryTag = rec.get[String](Template.ReadPrimarySamTag).getOrElse {
185+
throw new IllegalStateException(
186+
s"Missing '${Template.ReadPrimarySamTag}' tag required for TemplateCoordinate; try using ZipperBams or SetMateInformation"
187+
)
188+
}
189+
val matePrimaryTag = rec.get[String](Template.MatePrimarySamTag).getOrElse {
190+
throw new IllegalStateException(
191+
s"Missing '${Template.MatePrimarySamTag}' tag required for TemplateCoordinate; try using ZipperBams or SetMateInformation"
192+
)
193+
}
194+
(AlignmentInfo(readPrimaryTag, rec.header), AlignmentInfo(matePrimaryTag, rec.header))
202195
}
203-
} else {
204-
val primary = Supplementary(rec[String]("rp"))
205-
val mate = Supplementary(rec[String]("mp"))
206-
val readChrom = if (rec.unmapped) Int.MaxValue else primary.refIndex(rec.header)
207-
val mateChrom = if (rec.unpaired || rec.mateUnmapped) Int.MaxValue else mate.refIndex(rec.header)
208-
val readNeg = primary.negativeStrand
209-
val mateNeg = if (rec.paired) mate.negativeStrand else false
210-
val readPos = if (rec.unmapped) Int.MaxValue else if (readNeg) primary.unclippedEnd else primary.unclippedStart
211-
val matePos = if (rec.unpaired || rec.mateUnmapped) Int.MaxValue else if (mateNeg) mate.unclippedEnd else mate.unclippedStart
212-
val lib = Option(rec.readGroup).flatMap(rg => Option(rg.getLibrary)).getOrElse("Unknown")
213-
val mid = rec.get[String](ConsensusTags.MolecularId).map { m =>
214-
val index: Int = m.lastIndexOf('/')
215-
if (index >= 0) m.substring(0, index) else m
216-
}.getOrElse("")
196+
}
197+
val lib = Option(rec.readGroup).flatMap(rg => Option(rg.getLibrary)).getOrElse("Unknown")
198+
val mid = rec.get[String](ConsensusTags.MolecularId).map { m =>
199+
val index: Int = m.lastIndexOf('/')
200+
if (index >= 0) m.substring(0, index) else m
201+
}.getOrElse("")
217202

218-
if (readChrom < mateChrom || (readChrom == mateChrom && readPos < matePos) ||
219-
(readChrom == mateChrom && readPos == matePos && !readNeg)) {
220-
TemplateCoordinateKey(readChrom, mateChrom, readPos, matePos, readNeg, mateNeg, lib, mid, rec.name, false)
221-
}
222-
else {
223-
TemplateCoordinateKey(mateChrom, readChrom, matePos, readPos, mateNeg, readNeg, lib, mid, rec.name, true)
224-
}
203+
if (primary.refIndex < mate.refIndex || (primary.refIndex == mate.refIndex && primary.start < mate.start) ||
204+
(primary.refIndex == mate.refIndex && primary.start == mate.start && primary.positiveStrand)) {
205+
TemplateCoordinateKey(primary.refIndex, mate.refIndex, primary.start, mate.start, primary.negativeStrand, mate.negativeStrand, lib, mid, rec.name, false)
206+
}
207+
else {
208+
TemplateCoordinateKey(mate.refIndex, primary.refIndex, mate.start, primary.start, mate.negativeStrand, primary.negativeStrand, lib, mid, rec.name, true)
225209
}
226210
}
227211
}

src/main/scala/com/fulcrumgenomics/bam/api/SamRecord.scala

+1-1
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ class TransientAttrs(private val rec: SamRecord) {
7676

7777
/**
7878
* A trait that fgbio uses as a replacement for [[SAMRecord]]. The trait is self-typed as a
79-
* [[SamRecordIntermediate]] which is a sub-class of SAMRecord. It is done this wasy so that
79+
* [[SamRecordIntermediate]] which is a sub-class of SAMRecord. It is done this ways so that
8080
* a) we can access superclass methods via [[SamRecordIntermediate]] but that self-typing here
8181
* instead of extending hides the [[SAMRecord]] API from users of the class. The result is
8282
* always a [[SAMRecord]] but isn't seen as such without casting.

src/test/scala/com/fulcrumgenomics/alignment/AlignmentTest.scala

+35
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,41 @@ class AlignmentTest extends UnitSpec {
121121
Cigar("10M2D20M").reverse.toString shouldBe "20M2D10M"
122122
}
123123

124+
private case class ClippingTestCase
125+
(cigar: String, clippedBases: Int,
126+
leadingSoftClippedBases: Int, trailingSoftClippedBases: Int,
127+
leadingHardClippedBases: Int, trailingHardClippedBases: Int,
128+
leadingClippedBases: Int, trailingClippedBases: Int,
129+
unclippedStart: Int, unclippedEnd: Int)
130+
131+
Seq(
132+
ClippingTestCase("50M", 0, 0, 0, 0, 0, 0, 0, 0, 0), // no clipping
133+
ClippingTestCase("20S50M", 20, 20, 0, 0, 0, 20, 0, -20, 0), // leading soft clipping
134+
ClippingTestCase("50M10S", 10, 0, 10, 0, 0, 0, 10, 0, 10), // trailing soft clipping
135+
ClippingTestCase("20S50M10S", 30, 20, 10, 0, 0, 20, 10, -20, 10), // leading and trailing soft clipping
136+
ClippingTestCase("20H50M", 20, 0, 0, 20, 0, 20, 0, -20, 0), // leading hard clipping
137+
ClippingTestCase("50M10H", 10, 0, 0, 0, 10, 0, 10, 0, 10), // trailing hard clipping
138+
ClippingTestCase("20H50M10H", 30, 0, 0, 20, 10, 20, 10, -20, 10), // leading and trailing hard clipping
139+
ClippingTestCase("15H5S50M", 20, 5, 0, 15, 0, 20, 0, -20, 0), // leading hard-the-soft clipping
140+
ClippingTestCase("50M5S15H", 20, 0, 5, 0, 15, 0, 20, 0, 20), // trailing hard-the-soft clipping
141+
ClippingTestCase("20H15S50M10S5H", 50, 15, 10, 20, 5, 35, 15, -35, 15), // leading and trailing hard-the-soft clipping
142+
).foreach { testCase =>
143+
"Cigar clipping methods" should s"account for leading and trailing clipping in ${testCase.cigar}" in {
144+
val cigar = Cigar(testCase.cigar)
145+
cigar.clippedBases shouldBe testCase.clippedBases
146+
cigar.leadingSoftClippedBases shouldBe testCase.leadingSoftClippedBases
147+
cigar.trailingSoftClippedBases shouldBe testCase.trailingSoftClippedBases
148+
cigar.leadingHardClippedBases shouldBe testCase.leadingHardClippedBases
149+
cigar.trailingHardClippedBases shouldBe testCase.trailingHardClippedBases
150+
cigar.leadingClippedBases shouldBe testCase.leadingClippedBases
151+
cigar.trailingClippedBases shouldBe testCase.trailingClippedBases
152+
cigar.unclippedStart(100) shouldBe testCase.unclippedStart + 100
153+
cigar.unclippedEnd(200) shouldBe testCase.unclippedEnd + 200
154+
}
155+
156+
}
157+
158+
124159
/////////////////////////////////////////////////////////////////////////////
125160
// Tests for the Alignment class
126161
/////////////////////////////////////////////////////////////////////////////

0 commit comments

Comments
 (0)