Skip to content

Commit 87f07fe

Browse files
committed
add tests
1 parent f6052af commit 87f07fe

File tree

2 files changed

+145
-56
lines changed

2 files changed

+145
-56
lines changed

src/main/scala/com/fulcrumgenomics/vcf/DownsampleVcf.scala

+72-39
Original file line numberDiff line numberDiff line change
@@ -97,69 +97,102 @@ object DownsampleVcf extends LazyLogging {
9797
def downsampleAndRegenotype(gt: Genotype, proportion: Double, random: Random, epsilon: Double): Genotype = {
9898
val oldAds = gt.getOrElse[IndexedSeq[Int]]("AD", throw new Exception(s"AD tag not found for sample ${gt.sample}"))
9999
val newAds = downsampleADs(oldAds, proportion, random)
100-
val Seq(aa, ab, bb) = computePls(newAds)
101-
val Seq(alleleA, alleleB) = gt.alleles.toSeq
102-
103-
val calls = {
104-
if (aa == 0 && ab == 0 && bb == 0) IndexedSeq(NoCallAllele, NoCallAllele)
105-
else if (aa < ab && aa < bb) IndexedSeq(alleleA, alleleA)
106-
else if (bb < ab && bb < aa) IndexedSeq(alleleB, alleleB)
107-
else IndexedSeq(alleleA, alleleB)
108-
}
109-
gt.copy(attrs=Map("PL" -> IndexedSeq(aa, ab, bb), "AD" -> newAds, "DP" -> newAds.sum), calls=calls)
110-
}
111-
112-
/**
113-
* Compute the genotype likelihoods given the allele depths, assuming a diploid genotype (i.e.
114-
* two allele depths).
115-
* @param ads The input depths for the two alleles A and B.
116-
* @return a list of three likelihoods for the alleles AA, AB, and BB.
117-
*/
118-
def computePls(ads: IndexedSeq[Int]): IndexedSeq[Int] = {
119-
require(ads.length == 2, "there must be exactly two allele depths")
120-
val likelihoods = Likelihoods(ads(0), ads(1))
121-
IndexedSeq(likelihoods.aa.round.toInt, likelihoods.ab.round.toInt, likelihoods.bb.round.toInt)
100+
val likelihoods = Likelihoods(newAds)
101+
val pls = likelihoods.pls
102+
val calls = likelihoods.mostLikelyCall(gt.alleles.toSeq)
103+
gt.copy(attrs=Map("PL" -> pls, "AD" -> newAds, "DP" -> newAds.sum), calls=calls)
122104
}
123105

124106
object Likelihoods {
125-
/** Computes the likelihoods for each possible genotype.
126-
*
107+
/** Computes the likelihoods for each possible biallelic genotype.
127108
* @param alleleDepthA the reference allele depth
128109
* @param alleleDepthB the alternate allele depth
129110
* @param epsilon the error rate for genotyping
130111
* @return a new `Likelihoods` that has the likelihoods of AA, AB, and BB
131112
*/
132-
def apply(alleleDepthA: Int, alleleDepthB: Int, epsilon: Double=0.01): Likelihoods = {
113+
def biallelic(alleleDepthA: Int, alleleDepthB: Int, epsilon: Double = 0.01): Likelihoods = {
133114
val aGivenAA = log10(1 - epsilon)
134115
val aGivenBB = log10(epsilon)
135116
val aGivenAB = log10((1 - epsilon) / 2)
136117

137-
val rawGlAA = ((alleleDepthA * aGivenAA) + (alleleDepthB * aGivenBB)) * -10
138-
val rawGlBB = ((alleleDepthA * aGivenBB) + (alleleDepthB * aGivenAA)) * -10
139-
val rawGlAB = ((alleleDepthA + alleleDepthB) * aGivenAB) * -10
118+
val rawGlAA = ((alleleDepthA * aGivenAA) + (alleleDepthB * aGivenBB))
119+
val rawGlBB = ((alleleDepthA * aGivenBB) + (alleleDepthB * aGivenAA))
120+
val rawGlAB = ((alleleDepthA + alleleDepthB) * aGivenAB)
140121

141-
val minGL = math.min(math.min(rawGlAA, rawGlAB), rawGlBB)
122+
Likelihoods(2, IndexedSeq(rawGlAA, rawGlAB, rawGlBB))
123+
}
142124

125+
/** Computes the likelihoods for each possible multiallelic genotype.
126+
* @param alleleDepths the sequence of allele depths in the order specified in the VCF
127+
* @param epsilon the error rate for genotyping
128+
* @return a new `Likelihoods` that has the likelihoods of all possible genotypes in the order
129+
* specified in the VCF spec for the GL/PL tags.
130+
*/
131+
def multiallelic(alleleDepths: IndexedSeq[Int], epsilon: Double = 0.01): Likelihoods = {
132+
val numAlleles = alleleDepths.length
133+
// probabilities associated with each possible genotype for a pair of alleles
134+
val probs: Array[Double] = Array(
135+
math.log10(epsilon),
136+
math.log10((1 - epsilon) / 2),
137+
math.log10(1 - epsilon)
138+
)
139+
// raw genotype log-likelihoods
143140
Likelihoods(
144-
aa = rawGlAA - minGL,
145-
ab = rawGlAB - minGL,
146-
bb = rawGlBB - minGL
141+
numAlleles,
142+
(0 until numAlleles).flatMap(b =>
143+
(0 to b).map(a =>
144+
(0 until numAlleles).map(allele =>
145+
probs(Array(a, b).count(_ == allele)) * alleleDepths(allele)
146+
).sum
147+
)
148+
)
147149
)
148150
}
151+
152+
def apply(alleleDepths: IndexedSeq[Int], epsilon: Double = 0.01): Likelihoods = {
153+
require(alleleDepths.length >= 2, "at least two alleles are required to calculate genotype likelihoods")
154+
if (alleleDepths.length > 2) multiallelic(alleleDepths, epsilon)
155+
else biallelic(alleleDepths(0), alleleDepths(1), epsilon)
156+
}
149157
}
150158

151-
/** Stores the log10(likelihoods) for all possible bi-allelic genotypes.
152-
*
153-
* @param aa likelihood of AA
154-
* @param ab likelihood of AB
155-
* @param bb likelihood of BB
159+
/** Stores the log10(likelihoods) for all possible genotypes.
160+
* @param numAlleles the number of alleles the variant has
161+
* @param genotypeLikelihoods sequence of GLs in the order specified in the VCF spec
156162
*/
157-
case class Likelihoods(aa: Double, ab: Double, bb: Double) {
163+
case class Likelihoods(numAlleles: Int, genotypeLikelihoods: IndexedSeq[Double]) {
158164
/**
159165
* Returns the likelihoods as a list of phred-scaled integers (i.e, the value of the PL tag).
160166
* @return a list of phred-scaled likelihoods for AA, AB, BB.
161167
*/
162-
def pls = IndexedSeq(aa.round.toInt, ab.round.toInt, bb.round.toInt)
168+
def pls: IndexedSeq[Int] = {
169+
// multiply each GL by -10 to put it on the PHRED scale, subtract the min so the most likely
170+
// genotype has a PL of 0, then round to Int
171+
val rawPL = genotypeLikelihoods.map(gl => gl * -10)
172+
val minPL = rawPL.min
173+
rawPL.map(pl => (pl - minPL).round.toInt)
174+
}
175+
176+
def mostLikelyGenotype: Option[(Int, Int)] = {
177+
val minIndexes = pls.zipWithIndex.filter(pair => pair._1 == 0)
178+
minIndexes.length match {
179+
case 0 => throw new RuntimeException("expected the most likely PL to have a value of 0.0")
180+
case 1 => {
181+
val genotypes =
182+
for (b <- 0 until numAlleles; a <- 0 to b)
183+
yield (a, b)
184+
Some(genotypes(minIndexes.head._2))
185+
}
186+
case _ => None // if multiple genotypes are most likely, don't make a call
187+
}
188+
}
189+
190+
def mostLikelyCall(alleles: Seq[Allele]): IndexedSeq[Allele] = {
191+
mostLikelyGenotype match {
192+
case None => IndexedSeq(NoCallAllele, NoCallAllele)
193+
case Some((a, b)) => IndexedSeq(alleles(a), alleles(b))
194+
}
195+
}
163196
}
164197
}
165198

src/test/scala/com/fulcrumgenomics/vcf/DownsampleVcfTest.scala

+73-17
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ import com.fulcrumgenomics.util.Metric
66
import com.fulcrumgenomics.vcf.api.Allele.SimpleAllele
77
import com.fulcrumgenomics.vcf.api.{Allele, AlleleSet, Genotype, Variant}
88
import com.fulcrumgenomics.testing.UnitSpec
9-
import com.fulcrumgenomics.vcf.DownsampleVcf.{Likelihoods, computePls, downsampleAndRegenotype}
9+
import com.fulcrumgenomics.vcf.DownsampleVcf.{Likelihoods, downsampleAndRegenotype}
1010

1111
import scala.util.Random
1212

@@ -187,7 +187,7 @@ class DownsampleVcfTest extends UnitSpec {
187187
"DownsampleVcf.computePls" should "return new PLs that are not always 0,0,0" in {
188188
val ads = IndexedSeq[Int](0, 100)
189189
val expected = IndexedSeq(1996, 301, 0)
190-
val newlikelihoods = computePls(ads)
190+
val newlikelihoods = Likelihoods(ads).pls
191191
newlikelihoods should contain theSameElementsInOrderAs expected
192192
}
193193

@@ -196,51 +196,57 @@ class DownsampleVcfTest extends UnitSpec {
196196
*/
197197

198198
"DownsampleVcf.Likelihoods" should "return ref if all allele depths are zero" in {
199-
val likelihood = Likelihoods(alleleDepthA=0, alleleDepthB=0)
199+
val likelihood = Likelihoods(IndexedSeq(0, 0))
200200
val expected = IndexedSeq[Int](0, 0, 0)
201201
likelihood.pls.length shouldBe expected.length
202202
likelihood.pls should contain theSameElementsInOrderAs expected
203203
}
204204

205205
it should "return a likelihood of 0 for AA if there are only ref alleles observed" in {
206-
val likelihood = Likelihoods(alleleDepthA = 10, alleleDepthB = 0)
206+
val likelihood = Likelihoods(IndexedSeq(10, 0))
207207
val expected = IndexedSeq[Int](0, 30, 200)
208208
likelihood.pls should contain theSameElementsInOrderAs expected
209209
}
210210

211211
it should "return a likelihood of 0 for BB if there are only alt alleles observed" in {
212-
val likelihood = Likelihoods(alleleDepthA = 0, alleleDepthB = 10)
212+
val likelihood = Likelihoods(IndexedSeq(0, 10))
213213
val expected = IndexedSeq[Int](200, 30, 0)
214214
likelihood.pls should contain theSameElementsInOrderAs expected
215215
}
216216

217217
it should "return a likelihood of 0 for AB if there are an equal number of ref and alt alleles" in {
218-
val likelihood = Likelihoods(alleleDepthA = 5, alleleDepthB = 5)
218+
val likelihood = Likelihoods(IndexedSeq(5, 5))
219219
val expected = IndexedSeq[Int](70, 0, 70)
220220
likelihood.pls should contain theSameElementsInOrderAs expected
221221
}
222222

223223
it should "return a likelihood of 0 for AA if the AD A >> AD B" in {
224-
val likelihood = Likelihoods(alleleDepthA = 15, alleleDepthB = 2)
224+
val likelihood = Likelihoods(IndexedSeq(15, 2))
225225
likelihood.pls(0) == 0
226226
}
227227

228228
it should "return a likelihood of 0 for AB if AD.A and AD.B are similar but not equal" in {
229-
val likelihood = Likelihoods(alleleDepthA = 15, alleleDepthB = 17)
229+
val likelihood = Likelihoods(IndexedSeq(15, 17))
230230
likelihood.pls(1) == 0
231231
}
232232

233233
it should "return a likelihood of 0 for BB if AD.B >> AD.A but neither are 0" in {
234-
val likelihood = Likelihoods(alleleDepthA = 3, alleleDepthB = 30)
234+
val likelihood = Likelihoods(IndexedSeq(3, 30))
235235
likelihood.pls(2) == 0
236236
}
237237

238238
it should "return correct values when there are very few reads" in {
239-
Likelihoods(0, 0).pls should contain theSameElementsInOrderAs IndexedSeq(0, 0, 0)
240-
Likelihoods(1, 0).pls should contain theSameElementsInOrderAs IndexedSeq(0, 3, 20)
241-
Likelihoods(1, 1).pls should contain theSameElementsInOrderAs IndexedSeq(14, 0, 14)
242-
Likelihoods(0, 2).pls should contain theSameElementsInOrderAs IndexedSeq(40, 6, 0)
243-
Likelihoods(1, 2).pls should contain theSameElementsInOrderAs IndexedSeq(31, 0, 11)
239+
Likelihoods(IndexedSeq(0, 0)).pls should contain theSameElementsInOrderAs IndexedSeq(0, 0, 0)
240+
Likelihoods(IndexedSeq(1, 0)).pls should contain theSameElementsInOrderAs IndexedSeq(0, 3, 20)
241+
Likelihoods(IndexedSeq(1, 1)).pls should contain theSameElementsInOrderAs IndexedSeq(14, 0, 14)
242+
Likelihoods(IndexedSeq(0, 2)).pls should contain theSameElementsInOrderAs IndexedSeq(40, 6, 0)
243+
Likelihoods(IndexedSeq(1, 2)).pls should contain theSameElementsInOrderAs IndexedSeq(31, 0, 11)
244+
}
245+
246+
it should "return correct values for multi-allelic variants" in {
247+
Likelihoods(IndexedSeq(0, 0, 0)).pls should contain theSameElementsInOrderAs IndexedSeq(0, 0, 0, 0, 0, 0)
248+
Likelihoods(IndexedSeq(10, 0, 0)).pls should contain theSameElementsInOrderAs IndexedSeq(0, 30, 200, 30, 200, 200)
249+
Likelihoods(IndexedSeq(10, 10, 0)).pls should contain theSameElementsInOrderAs IndexedSeq(139, 0, 139, 169, 169, 339)
244250
}
245251

246252

@@ -251,10 +257,10 @@ class DownsampleVcfTest extends UnitSpec {
251257
Genotype(alleles=AlleleSet(ref=SimpleAllele(ref), alts=IndexedSeq(Allele(alt))),
252258
sample=sample,
253259
calls=IndexedSeq[Allele](Allele(ref), Allele(alt)),
254-
attrs=Map("AD" -> ads, "PL" -> Likelihoods(alleleDepthA = ads(0), alleleDepthB = ads(1))))
260+
attrs=Map("AD" -> ads, "PL" -> Likelihoods(ads))
261+
)
255262
}
256263

257-
258264
"DownsampleVcf.downsampleAndRegneotype(Genotype)" should "return no call if all allele depths are zero" in {
259265
val geno = makeGt(ref="A", alt="T", ads=IndexedSeq(0,0))
260266
val newGeno = downsampleAndRegenotype(gt=geno, proportion=0.01, random = new Random(42), epsilon = 0.01)
@@ -298,6 +304,30 @@ class DownsampleVcfTest extends UnitSpec {
298304
newGeno.calls should contain theSameElementsInOrderAs expected
299305
}
300306

307+
/*
308+
testing DownsampleVcf.downsampleAndRegenotype on tri-allelic Genotypes
309+
*/
310+
private def makeTriallelicGt(ref: String, alt1: String, alt2: String, ads: IndexedSeq[Int], sample: String ="test"): Genotype = {
311+
val likelihoods = Likelihoods(ads)
312+
val alleles = AlleleSet(ref=SimpleAllele(ref), alts=IndexedSeq(Allele(alt1), Allele(alt2)))
313+
val calls = likelihoods.mostLikelyCall(alleles.toSeq)
314+
Genotype(alleles, sample=sample, calls=calls, attrs=Map("AD" -> ads, "PL" -> likelihoods.pls))
315+
}
316+
317+
it should "return ref,alt1 for a tri-allelic genotype if those alleles have the highest depth" in {
318+
val geno = makeTriallelicGt(ref="A", alt1="T", alt2="G", ads=IndexedSeq(100, 100, 0))
319+
val newGeno = downsampleAndRegenotype(gt=geno, proportion=0.1, random = new Random(42), epsilon = 0.01)
320+
val expected = IndexedSeq(Allele("A"), Allele("T"))
321+
newGeno.calls should contain theSameElementsInOrderAs expected
322+
}
323+
324+
it should "return alt1,alt2 for a tri-allelic genotype if those alleles have the highest depth" in {
325+
val geno = makeTriallelicGt(ref="A", alt1="T", alt2="G", ads=IndexedSeq(0, 100, 100))
326+
val newGeno = downsampleAndRegenotype(gt=geno, proportion=0.1, random = new Random(42), epsilon = 0.01)
327+
val expected = IndexedSeq(Allele("T"), Allele("G"))
328+
newGeno.calls should contain theSameElementsInOrderAs expected
329+
}
330+
301331
/*
302332
testing DownsampleVcf.downsampleAndRegenotype on Variant
303333
*/
@@ -306,7 +336,7 @@ class DownsampleVcfTest extends UnitSpec {
306336
Variant(chrom="1",
307337
pos=10,
308338
alleles=AlleleSet(ref=Allele(ref), alts=Allele(alt)),
309-
genotypes=Map(sample -> makeGt(ref=ref, alt=alt, ads=ads, sample = sample))
339+
genotypes=Map(sample -> makeGt(ref=ref, alt=alt, ads=ads, sample=sample))
310340
)
311341
}
312342

@@ -345,6 +375,32 @@ class DownsampleVcfTest extends UnitSpec {
345375
newVariant.genotypes("test").calls should contain theSameElementsInOrderAs expected
346376
}
347377

378+
/*
379+
testing DownsampleVcf.downsampleAndRegenotype on tri-allelic Variants
380+
*/
381+
private def makeTriallelicVariant(ref: String, alt1: String, alt2: String, ads: IndexedSeq[Int], sample: String ="test"): Variant = {
382+
val likelihoods = Likelihoods(ads)
383+
val alleles = AlleleSet(ref=SimpleAllele(ref), alts=IndexedSeq(Allele(alt1), Allele(alt2)))
384+
Variant(chrom="1",
385+
pos=10,
386+
alleles=alleles,
387+
genotypes=Map(sample -> makeTriallelicGt(ref=ref, alt1=alt1, alt2=alt2, ads=ads, sample=sample)))
388+
}
389+
390+
it should "return ref,alt1 for a tri-allelic variant if those alleles have the highest depth" in {
391+
val variant = makeTriallelicVariant(ref="A", alt1="T", alt2="G", ads=IndexedSeq(100, 100, 0))
392+
val newVariant = downsampleAndRegenotype(variant=variant, proportions = Map("test" -> 0.1), random = new Random(42), epsilon = 0.01)
393+
val expected = IndexedSeq(Allele("A"), Allele("T"))
394+
newVariant.genotypes("test").calls should contain theSameElementsInOrderAs expected
395+
}
396+
397+
it should "return alt1,alt2 for a tri-allelic variant if those alleles have the highest depth" in {
398+
val variant = makeTriallelicVariant(ref="A", alt1="T", alt2="G", ads=IndexedSeq(0, 100, 100))
399+
val newVariant = downsampleAndRegenotype(variant=variant, proportions = Map("test" -> 0.1), random = new Random(42), epsilon = 0.01)
400+
val expected = IndexedSeq(Allele("T"), Allele("G"))
401+
newVariant.genotypes("test").calls should contain theSameElementsInOrderAs expected
402+
}
403+
348404
private val sample = "test1"
349405
private val builder = VcfBuilder(samples=Seq(sample))
350406
builder.add(chrom="chr1", pos=100, id="1", alleles=Seq("A", "C"), info=Map(),

0 commit comments

Comments
 (0)