Skip to content

Commit 75de344

Browse files
authored
Merge pull request #976 from fulcrumgenomics/multiallelic-likelihoods
Add support for multi-allelic variants
2 parents 9a2d510 + 0232972 commit 75de344

File tree

2 files changed

+186
-60
lines changed

2 files changed

+186
-60
lines changed

src/main/scala/com/fulcrumgenomics/vcf/DownsampleVcf.scala

+78-40
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ import com.fulcrumgenomics.vcf.DownsampleVcf.{downsampleAndRegenotype, winnowVar
1313

1414
import scala.math.log10
1515
import scala.util.Random
16+
import scala.tools.nsc.doc.html.HtmlTags
1617

1718
object DownsampleVcf extends LazyLogging {
1819
/** Removes variants that are within a specified distance from a previous variant.
@@ -97,69 +98,106 @@ object DownsampleVcf extends LazyLogging {
9798
def downsampleAndRegenotype(gt: Genotype, proportion: Double, random: Random, epsilon: Double): Genotype = {
9899
val oldAds = gt.getOrElse[IndexedSeq[Int]]("AD", throw new Exception(s"AD tag not found for sample ${gt.sample}"))
99100
val newAds = downsampleADs(oldAds, proportion, random)
100-
val Seq(aa, ab, bb) = computePls(newAds)
101-
val Seq(alleleA, alleleB) = gt.alleles.toSeq
102-
103-
val calls = {
104-
if (aa == 0 && ab == 0 && bb == 0) IndexedSeq(NoCallAllele, NoCallAllele)
105-
else if (aa < ab && aa < bb) IndexedSeq(alleleA, alleleA)
106-
else if (bb < ab && bb < aa) IndexedSeq(alleleB, alleleB)
107-
else IndexedSeq(alleleA, alleleB)
108-
}
109-
gt.copy(attrs=Map("PL" -> IndexedSeq(aa, ab, bb), "AD" -> newAds, "DP" -> newAds.sum), calls=calls)
110-
}
111-
112-
/**
113-
* Compute the genotype likelihoods given the allele depths, assuming a diploid genotype (i.e.
114-
* two allele depths).
115-
* @param ads The input depths for the two alleles A and B.
116-
* @return a list of three likelihoods for the alleles AA, AB, and BB.
117-
*/
118-
def computePls(ads: IndexedSeq[Int]): IndexedSeq[Int] = {
119-
require(ads.length == 2, "there must be exactly two allele depths")
120-
val likelihoods = Likelihoods(ads(0), ads(1))
121-
IndexedSeq(likelihoods.aa.round.toInt, likelihoods.ab.round.toInt, likelihoods.bb.round.toInt)
101+
val likelihoods = Likelihoods(newAds)
102+
val pls = likelihoods.pls
103+
val calls = likelihoods.mostLikelyCall(gt.alleles.toSeq)
104+
gt.copy(attrs=Map("PL" -> pls, "AD" -> newAds, "DP" -> newAds.sum), calls=calls)
122105
}
123106

124107
object Likelihoods {
125-
/** Computes the likelihoods for each possible genotype.
126-
*
108+
/** Converts a sequence of log-likelihoods to phred-scale by 1) multiplying each by -10, 2)
109+
* subtracting from each the min value so the smallest value is 0, and 3) rounding to the
110+
* nearest integer.
111+
*/
112+
def logToPhredLikelihoods(logLikelihoods: IndexedSeq[Double]): IndexedSeq[Int] = {
113+
val rawPL = logLikelihoods.map(gl => gl * -10)
114+
val minPL = rawPL.min
115+
rawPL.map(pl => (pl - minPL).round.toInt)
116+
}
117+
118+
/** Computes the likelihoods for each possible biallelic genotype.
127119
* @param alleleDepthA the reference allele depth
128120
* @param alleleDepthB the alternate allele depth
129121
* @param epsilon the error rate for genotyping
130122
* @return the log10 likelihoods of AA, AB, and BB, in that order
131123
*/
132-
def apply(alleleDepthA: Int, alleleDepthB: Int, epsilon: Double=0.01): Likelihoods = {
124+
def biallelic(alleleDepthA: Int, alleleDepthB: Int, epsilon: Double = 0.01): IndexedSeq[Double] = {
133125
val aGivenAA = log10(1 - epsilon)
134126
val aGivenBB = log10(epsilon)
135127
val aGivenAB = log10((1 - epsilon) / 2)
136128

137-
val rawGlAA = ((alleleDepthA * aGivenAA) + (alleleDepthB * aGivenBB)) * -10
138-
val rawGlBB = ((alleleDepthA * aGivenBB) + (alleleDepthB * aGivenAA)) * -10
139-
val rawGlAB = ((alleleDepthA + alleleDepthB) * aGivenAB) * -10
129+
val rawGlAA = ((alleleDepthA * aGivenAA) + (alleleDepthB * aGivenBB))
130+
val rawGlBB = ((alleleDepthA * aGivenBB) + (alleleDepthB * aGivenAA))
131+
val rawGlAB = ((alleleDepthA + alleleDepthB) * aGivenAB)
140132

141-
val minGL = math.min(math.min(rawGlAA, rawGlAB), rawGlBB)
133+
IndexedSeq(rawGlAA, rawGlAB, rawGlBB)
134+
}
142135

143-
Likelihoods(
144-
aa = rawGlAA - minGL,
145-
ab = rawGlAB - minGL,
146-
bb = rawGlBB - minGL
136+
/** Computes the likelihoods for each possible genotype given a sequence of read depths for any
137+
* number of alleles.
138+
* @param alleleDepths the sequence of allele depths in the order specified in the VCF
139+
* @param epsilon the error rate for genotyping
140+
* @return the log10 likelihoods of all possible genotypes, in the
141+
* order specified in the VCF spec for the GL/PL tags.
142+
*/
143+
def generalized(alleleDepths: IndexedSeq[Int], epsilon: Double = 0.01): IndexedSeq[Double] = {
144+
val numAlleles = alleleDepths.length
145+
// probabilities associated with each possible genotype for a pair of alleles
146+
val logProbs: Array[Double] = Array(
147+
math.log10(epsilon),
148+
math.log10((1 - epsilon) / 2),
149+
math.log10(1 - epsilon)
147150
)
151+
// compute genotype log-likelihoods
152+
(0 until numAlleles).flatMap(b =>
153+
(0 to b).map(a =>
154+
(0 until numAlleles).map(allele =>
155+
logProbs(Array(a, b).count(_ == allele)) * alleleDepths(allele)
156+
).sum
157+
)
158+
)
159+
}
160+
161+
def apply(alleleDepths: IndexedSeq[Int], epsilon: Double = 0.01): Likelihoods = {
162+
val numAlleles = alleleDepths.length
163+
require(numAlleles >= 2, "at least two alleles are required to calculate genotype likelihoods")
164+
Likelihoods(numAlleles, generalized(alleleDepths, epsilon))
148165
}
149166
}
150167

151-
/** Stores the log10(likelihoods) for all possible bi-allelic genotypes.
152-
*
153-
* @param aa likelihood of AA
154-
* @param ab likelihood of AB
155-
* @param bb likelihood of BB
168+
/** Stores the log10(likelihoods) for all possible genotypes.
169+
* @param numAlleles the number of alleles the variant has
170+
* @param genotypeLikelihoods sequence of GLs in the order specified in the VCF spec
156171
*/
157-
case class Likelihoods(aa: Double, ab: Double, bb: Double) {
172+
case class Likelihoods(numAlleles: Int, genotypeLikelihoods: IndexedSeq[Double]) {
158173
/**
159174
* Returns the likelihoods as a list of phred-scaled integers (i.e, the value of the PL tag).
160175
* @return a list of phred-scaled likelihoods, one per genotype (AA, AB, BB for a biallelic site).
161176
*/
162-
def pls = IndexedSeq(aa.round.toInt, ab.round.toInt, bb.round.toInt)
177+
def pls: IndexedSeq[Int] = {
178+
Likelihoods.logToPhredLikelihoods(genotypeLikelihoods)
179+
}
180+
181+
def mostLikelyGenotype: Option[(Int, Int)] = {
182+
val minIndexes = pls.zipWithIndex.filter(pair => pair._1 == 0)
183+
minIndexes.length match {
184+
case 0 => throw new RuntimeException("expected the most likely PL to have a value of 0.0")
185+
case 1 => {
186+
val genotypes =
187+
for (b <- 0 until numAlleles; a <- 0 to b)
188+
yield (a, b)
189+
Some(genotypes(minIndexes.head._2))
190+
}
191+
case _ => None // if multiple genotypes are most likely, don't make a call
192+
}
193+
}
194+
195+
def mostLikelyCall(alleles: Seq[Allele]): IndexedSeq[Allele] = {
196+
mostLikelyGenotype match {
197+
case None => IndexedSeq(NoCallAllele, NoCallAllele)
198+
case Some((a, b)) => IndexedSeq(alleles(a), alleles(b))
199+
}
200+
}
163201
}
164202
}
165203

src/test/scala/com/fulcrumgenomics/vcf/DownsampleVcfTest.scala

+108-20
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ import com.fulcrumgenomics.util.Metric
66
import com.fulcrumgenomics.vcf.api.Allele.SimpleAllele
77
import com.fulcrumgenomics.vcf.api.{Allele, AlleleSet, Genotype, Variant}
88
import com.fulcrumgenomics.testing.UnitSpec
9-
import com.fulcrumgenomics.vcf.DownsampleVcf.{Likelihoods, computePls, downsampleAndRegenotype}
9+
import com.fulcrumgenomics.vcf.DownsampleVcf.{Likelihoods, downsampleAndRegenotype}
1010

1111
import scala.util.Random
1212

@@ -187,7 +187,7 @@ class DownsampleVcfTest extends UnitSpec {
187187
"DownsampleVcf.computePls" should "return new PLs that are not always 0,0,0" in {
188188
val ads = IndexedSeq[Int](0, 100)
189189
val expected = IndexedSeq(1996, 301, 0)
190-
val newlikelihoods = computePls(ads)
190+
val newlikelihoods = Likelihoods(ads).pls
191191
newlikelihoods should contain theSameElementsInOrderAs expected
192192
}
193193

@@ -196,51 +196,89 @@ class DownsampleVcfTest extends UnitSpec {
196196
*/
197197

198198
"DownsampleVcf.Likelihoods" should "return ref if all allele depths are zero" in {
199-
val likelihood = Likelihoods(alleleDepthA=0, alleleDepthB=0)
199+
val likelihood = Likelihoods(IndexedSeq(0, 0))
200200
val expected = IndexedSeq[Int](0, 0, 0)
201201
likelihood.pls.length shouldBe expected.length
202202
likelihood.pls should contain theSameElementsInOrderAs expected
203203
}
204204

205+
it should "return correct results for basic cases" in {
206+
val e = 0.01
207+
val cases: IndexedSeq[(IndexedSeq[Int], IndexedSeq[Double])] = IndexedSeq(
208+
(IndexedSeq(1, 0), IndexedSeq(1 - e, 0.5, e)),
209+
(IndexedSeq(0, 1), IndexedSeq(e, 0.5, 1 - e)),
210+
(IndexedSeq(1, 1), IndexedSeq((1 - e) * e, 0.25, (1 - e) * e)),
211+
(IndexedSeq(2, 0), IndexedSeq(math.pow((1 - e), 2), 0.25, math.pow(e, 2))),
212+
(IndexedSeq(0, 0, 1), IndexedSeq(e, e, e, 0.5, 0.5, 1 - e)),
213+
)
214+
cases.foreach { case (input, output) =>
215+
val likelihood = Likelihoods(input, e)
216+
val logOutput = output.map(p => math.log10(p))
217+
likelihood.pls.length shouldBe logOutput.length
218+
likelihood.pls should contain theSameElementsInOrderAs DownsampleVcf.Likelihoods.logToPhredLikelihoods(logOutput)
219+
}
220+
}
221+
222+
it should "return the same results for biallelic and generalized algorithm" in {
223+
val e = 0.01
224+
val cases: IndexedSeq[(IndexedSeq[Int], IndexedSeq[Double])] = IndexedSeq(
225+
(IndexedSeq(1, 0), IndexedSeq(1 - e, 0.5, e)),
226+
(IndexedSeq(0, 1), IndexedSeq(e, 0.5, 1 - e)),
227+
(IndexedSeq(1, 1), IndexedSeq((1 - e) * e, 0.25, (1 - e) * e)),
228+
(IndexedSeq(2, 0), IndexedSeq(math.pow((1 - e), 2), 0.25, math.pow(e, 2))),
229+
)
230+
cases.foreach { case (input, output) =>
231+
val biallelic = Likelihoods(2, DownsampleVcf.Likelihoods.biallelic(input(0), input(1), e))
232+
val generalized = Likelihoods(2, DownsampleVcf.Likelihoods.generalized(input, e))
233+
biallelic.pls should contain theSameElementsInOrderAs generalized.pls
234+
}
235+
}
236+
205237
it should "return a likelihood of 0 for AA if there are only ref alleles observed" in {
206-
val likelihood = Likelihoods(alleleDepthA = 10, alleleDepthB = 0)
238+
val likelihood = Likelihoods(IndexedSeq(10, 0))
207239
val expected = IndexedSeq[Int](0, 30, 200)
208240
likelihood.pls should contain theSameElementsInOrderAs expected
209241
}
210242

211243
it should "return a likelihood of 0 for BB if there are only alt alleles observed" in {
212-
val likelihood = Likelihoods(alleleDepthA = 0, alleleDepthB = 10)
244+
val likelihood = Likelihoods(IndexedSeq(0, 10))
213245
val expected = IndexedSeq[Int](200, 30, 0)
214246
likelihood.pls should contain theSameElementsInOrderAs expected
215247
}
216248

217249
it should "return a likelihood of 0 for AB if there are an equal number of ref and alt alleles" in {
218-
val likelihood = Likelihoods(alleleDepthA = 5, alleleDepthB = 5)
250+
val likelihood = Likelihoods(IndexedSeq(5, 5))
219251
val expected = IndexedSeq[Int](70, 0, 70)
220252
likelihood.pls should contain theSameElementsInOrderAs expected
221253
}
222254

223255
it should "return a likelihood of 0 for AA if the AD A >> AD B" in {
224-
val likelihood = Likelihoods(alleleDepthA = 15, alleleDepthB = 2)
225-
likelihood.pls(0) == 0
256+
val likelihood = Likelihoods(IndexedSeq(15, 2))
257+
assert(likelihood.pls(0) == 0)
226258
}
227259

228260
it should "return a likelihood of 0 for AB if AD.A and AD.B are similar but not equal" in {
229-
val likelihood = Likelihoods(alleleDepthA = 15, alleleDepthB = 17)
230-
likelihood.pls(1) == 0
261+
val likelihood = Likelihoods(IndexedSeq(15, 17))
262+
assert(likelihood.pls(1) == 0)
231263
}
232264

233265
it should "return a likelihood of 0 for BB if AD.B >> AD.A but neither are 0" in {
234-
val likelihood = Likelihoods(alleleDepthA = 3, alleleDepthB = 30)
235-
likelihood.pls(2) == 0
266+
val likelihood = Likelihoods(IndexedSeq(3, 30))
267+
assert(likelihood.pls(2) == 0)
236268
}
237269

238270
it should "return correct values when there are very few reads" in {
239-
Likelihoods(0, 0).pls should contain theSameElementsInOrderAs IndexedSeq(0, 0, 0)
240-
Likelihoods(1, 0).pls should contain theSameElementsInOrderAs IndexedSeq(0, 3, 20)
241-
Likelihoods(1, 1).pls should contain theSameElementsInOrderAs IndexedSeq(14, 0, 14)
242-
Likelihoods(0, 2).pls should contain theSameElementsInOrderAs IndexedSeq(40, 6, 0)
243-
Likelihoods(1, 2).pls should contain theSameElementsInOrderAs IndexedSeq(31, 0, 11)
271+
Likelihoods(IndexedSeq(0, 0)).pls should contain theSameElementsInOrderAs IndexedSeq(0, 0, 0)
272+
Likelihoods(IndexedSeq(1, 0)).pls should contain theSameElementsInOrderAs IndexedSeq(0, 3, 20)
273+
Likelihoods(IndexedSeq(1, 1)).pls should contain theSameElementsInOrderAs IndexedSeq(14, 0, 14)
274+
Likelihoods(IndexedSeq(0, 2)).pls should contain theSameElementsInOrderAs IndexedSeq(40, 6, 0)
275+
Likelihoods(IndexedSeq(1, 2)).pls should contain theSameElementsInOrderAs IndexedSeq(31, 0, 11)
276+
}
277+
278+
it should "return correct values for multi-allelic variants" in {
279+
Likelihoods(IndexedSeq(0, 0, 0)).pls should contain theSameElementsInOrderAs IndexedSeq(0, 0, 0, 0, 0, 0)
280+
Likelihoods(IndexedSeq(10, 0, 0)).pls should contain theSameElementsInOrderAs IndexedSeq(0, 30, 200, 30, 200, 200)
281+
Likelihoods(IndexedSeq(10, 10, 0)).pls should contain theSameElementsInOrderAs IndexedSeq(139, 0, 139, 169, 169, 339)
244282
}
245283

246284

@@ -251,10 +289,10 @@ class DownsampleVcfTest extends UnitSpec {
251289
Genotype(alleles=AlleleSet(ref=SimpleAllele(ref), alts=IndexedSeq(Allele(alt))),
252290
sample=sample,
253291
calls=IndexedSeq[Allele](Allele(ref), Allele(alt)),
254-
attrs=Map("AD" -> ads, "PL" -> Likelihoods(alleleDepthA = ads(0), alleleDepthB = ads(1))))
292+
attrs=Map("AD" -> ads, "PL" -> Likelihoods(ads))
293+
)
255294
}
256295

257-
258296
"DownsampleVcf.downsampleAndRegneotype(Genotype)" should "return no call if all allele depths are zero" in {
259297
val geno = makeGt(ref="A", alt="T", ads=IndexedSeq(0,0))
260298
val newGeno = downsampleAndRegenotype(gt=geno, proportion=0.01, random = new Random(42), epsilon = 0.01)
@@ -298,6 +336,30 @@ class DownsampleVcfTest extends UnitSpec {
298336
newGeno.calls should contain theSameElementsInOrderAs expected
299337
}
300338

339+
/*
340+
testing DownsampleVcf.downsampleAndRegenotype on tri-allelic Genotypes
341+
*/
342+
private def makeTriallelicGt(ref: String, alt1: String, alt2: String, ads: IndexedSeq[Int], sample: String ="test"): Genotype = {
343+
val likelihoods = Likelihoods(ads)
344+
val alleles = AlleleSet(ref=SimpleAllele(ref), alts=IndexedSeq(Allele(alt1), Allele(alt2)))
345+
val calls = likelihoods.mostLikelyCall(alleles.toSeq)
346+
Genotype(alleles, sample=sample, calls=calls, attrs=Map("AD" -> ads, "PL" -> likelihoods.pls))
347+
}
348+
349+
it should "return ref,alt1 for a tri-allelic genotype if those alleles have the highest depth" in {
350+
val geno = makeTriallelicGt(ref="A", alt1="T", alt2="G", ads=IndexedSeq(100, 100, 0))
351+
val newGeno = downsampleAndRegenotype(gt=geno, proportion=0.1, random = new Random(42), epsilon = 0.01)
352+
val expected = IndexedSeq(Allele("A"), Allele("T"))
353+
newGeno.calls should contain theSameElementsInOrderAs expected
354+
}
355+
356+
it should "return alt1,alt2 for a tri-allelic genotype if those alleles have the highest depth" in {
357+
val geno = makeTriallelicGt(ref="A", alt1="T", alt2="G", ads=IndexedSeq(0, 100, 100))
358+
val newGeno = downsampleAndRegenotype(gt=geno, proportion=0.1, random = new Random(42), epsilon = 0.01)
359+
val expected = IndexedSeq(Allele("T"), Allele("G"))
360+
newGeno.calls should contain theSameElementsInOrderAs expected
361+
}
362+
301363
/*
302364
testing DownsampleVcf.downsampleAndRegenotype on Variant
303365
*/
@@ -306,7 +368,7 @@ class DownsampleVcfTest extends UnitSpec {
306368
Variant(chrom="1",
307369
pos=10,
308370
alleles=AlleleSet(ref=Allele(ref), alts=Allele(alt)),
309-
genotypes=Map(sample -> makeGt(ref=ref, alt=alt, ads=ads, sample = sample))
371+
genotypes=Map(sample -> makeGt(ref=ref, alt=alt, ads=ads, sample=sample))
310372
)
311373
}
312374

@@ -345,6 +407,32 @@ class DownsampleVcfTest extends UnitSpec {
345407
newVariant.genotypes("test").calls should contain theSameElementsInOrderAs expected
346408
}
347409

410+
/*
411+
testing DownsampleVcf.downsampleAndRegenotype on tri-allelic Variants
412+
*/
413+
private def makeTriallelicVariant(ref: String, alt1: String, alt2: String, ads: IndexedSeq[Int], sample: String ="test"): Variant = {
414+
val likelihoods = Likelihoods(ads)
415+
val alleles = AlleleSet(ref=SimpleAllele(ref), alts=IndexedSeq(Allele(alt1), Allele(alt2)))
416+
Variant(chrom="1",
417+
pos=10,
418+
alleles=alleles,
419+
genotypes=Map(sample -> makeTriallelicGt(ref=ref, alt1=alt1, alt2=alt2, ads=ads, sample=sample)))
420+
}
421+
422+
it should "return ref,alt1 for a tri-allelic variant if those alleles have the highest depth" in {
423+
val variant = makeTriallelicVariant(ref="A", alt1="T", alt2="G", ads=IndexedSeq(100, 100, 0))
424+
val newVariant = downsampleAndRegenotype(variant=variant, proportions = Map("test" -> 0.1), random = new Random(42), epsilon = 0.01)
425+
val expected = IndexedSeq(Allele("A"), Allele("T"))
426+
newVariant.genotypes("test").calls should contain theSameElementsInOrderAs expected
427+
}
428+
429+
it should "return alt1,alt2 for a tri-allelic variant if those alleles have the highest depth" in {
430+
val variant = makeTriallelicVariant(ref="A", alt1="T", alt2="G", ads=IndexedSeq(0, 100, 100))
431+
val newVariant = downsampleAndRegenotype(variant=variant, proportions = Map("test" -> 0.1), random = new Random(42), epsilon = 0.01)
432+
val expected = IndexedSeq(Allele("T"), Allele("G"))
433+
newVariant.genotypes("test").calls should contain theSameElementsInOrderAs expected
434+
}
435+
348436
private val sample = "test1"
349437
private val builder = VcfBuilder(samples=Seq(sample))
350438
builder.add(chrom="chr1", pos=100, id="1", alleles=Seq("A", "C"), info=Map(),

0 commit comments

Comments
 (0)