diff --git a/allel/__init__.py b/allel/__init__.py index afb55e51..a5bbbe8c 100644 --- a/allel/__init__.py +++ b/allel/__init__.py @@ -31,8 +31,8 @@ plot_pairwise_distance, condensed_coords, condensed_coords_between, \ condensed_coords_within -from .stats.hw import heterozygosity_observed, heterozygosity_expected, \ - inbreeding_coefficient +from .stats.hw import heterozygosity_individual, heterozygosity_observed, \ + heterozygosity_expected, inbreeding_coefficient from .stats.ld import rogers_huff_r, rogers_huff_r_between, \ locate_unlinked, plot_pairwise_ld, windowed_r_squared diff --git a/allel/stats/hw.py b/allel/stats/hw.py index 1ef7e20f..c84837a2 100644 --- a/allel/stats/hw.py +++ b/allel/stats/hw.py @@ -6,7 +6,86 @@ from allel.util import ignore_invalid, asarray_ndim -def heterozygosity_observed(g, fill=np.nan): +def heterozygosity_individual(g, fill=np.nan, corrected=True, ploidy=None): + """Calculate the individual heterozygosity of each sample for + each variant following Hardy (2016). + + Parameters + ---------- + g : array_like, int, shape (n_variants, n_samples, ploidy) + Genotype array. + fill : float, optional + Use this value for variants with invalid inputs. + corrected : bool, optional + If True, values are corrected for ploidy level. + ploidy : array_like, int, (n_variants, n_samples), optional + Specify ploidy of each genotype call. + + Returns + ------- + hi : ndarray, float, shape (n_variants, n_samples) + Observed individual heterozygosity + + Notes + ----- + Individual heterozygosity is calculated assuming polysomic inheritance + for polyploid genotype arrays following Hardy (2016). + + If the ploidy argument is used then the ploidy specified for each + genotype call must be equal to the number of alleles called for that + genotype or else the genotype will be treated as missing. + + Examples + -------- + + >>> import allel + >>> g = allel.GenotypeArray([[[0, 0, -1, -1], [0, 0, 0, 0]], + ... [[0, 1, -1, -1], [0, 0, 0, 1]], + ... [[1, 1, -1, -1], [0, 1, 1, 2]], + ... [[0, -1, -1, -1], [0, 1, 2, 3]], + ... [[-1, -1, -1, -1], [0, 1, -1, -1]]]) + >>> ploidy = [[2, 4], + ... [2, 4], + ... [2, 4], + ... [2, 4], + ... [2, 4]] + >>> allel.heterozygosity_individual(g, ploidy=ploidy) + array([[0. , 0. ], + [1. , 0.5 ], + [0. , 0.83333333], + [ nan, 1. ], + [ nan, nan]]) + + """ + # check inputs + if not hasattr(g, 'ploidy') or not hasattr(g, 'to_allele_counts'): + g = GenotypeArray(g, copy=False) + + # use array ploidy if none provided + ploidy = g.ploidy if ploidy is None else np.array(ploidy) + + # convert to genotype allele counts, remove those not matching ploidy + gac = g.to_allele_counts() + is_called = gac.values.sum(axis=-1) == ploidy + gac[~is_called] = 0 + + # genotype allele frequencies with partial genotypes removed + gaf = gac.to_frequencies() + + # correction for ploidy level + with ignore_invalid(): + correction = ploidy / (ploidy - 1) if corrected else 1 + + # matrix of observed heterozygosity per sample + hi = (1 - np.sum(np.power(gaf, 2), axis=-1)) * correction + + if fill is not np.nan: + hi[np.isnan(hi)] = fill + + return hi + + +def heterozygosity_observed(g, fill=np.nan, corrected=True, ploidy=None): """Calculate the rate of observed heterozygosity for each variant. Parameters @@ -16,6 +95,10 @@ def heterozygosity_observed(g, fill=np.nan): Genotype array. fill : float, optional Use this value for variants where all calls are missing. + corrected : bool, optional + If True, values are corrected for ploidy level. + ploidy : array_like, int, (n_variants, n_samples), optional + Specify ploidy of each genotype call. Returns ------- @@ -23,9 +106,27 @@ def heterozygosity_observed(g, fill=np.nan): ho : ndarray, float, shape (n_variants,) Observed heterozygosity + Notes + ----- + + Observed heterozygosity is calculated assuming polysomic inheritance + for polyploid genotype arrays following Hardy (2016) (also see Meirmans + and Liu 2018). + + By default, observed heterozygosity is corrected for ploidy by + *Ho = Hu * (ploidy / (ploidy - 1))* where *Ho* is the corrected observed + heterozygosity and *Hu* is the uncorrected observed heterozygosity + described by Meirmans and Liu (2018). + + If the ploidy argument is used then the ploidy specified for each + genotype call must be equal to the number of called alleles for that + genotype or else the genotype will be treated as missing. + Examples -------- + Diploid genotype array + >>> import allel >>> g = allel.GenotypeArray([[[0, 0], [0, 0], [0, 0]], ... [[0, 0], [0, 1], [1, 1]], @@ -34,20 +135,30 @@ def heterozygosity_observed(g, fill=np.nan): >>> allel.heterozygosity_observed(g) array([0. , 0.33333333, 0. , 0.5 ]) - """ + Tetraploid genotype array - # check inputs - if not hasattr(g, 'count_het') or not hasattr(g, 'count_called'): - g = GenotypeArray(g, copy=False) + >>> import allel + >>> g = allel.GenotypeArray([[[0, 0, 0, 0], [0, 0, 0, 0]], + ... [[0, 0, 0, 0], [0, 0, 0, 1]], + ... [[0, 0, 1, 1], [0, 0, 1, 2]], + ... [[0, 0, 1, 1], [0, 0, -1, -1]], + ... [[0, 1, 2, 3], [-1, -1, -1, -1]]]) + >>> allel.heterozygosity_observed(g) + array([0. , 0.25 , 0.75 , 0.66666667, 1. ]) + + """ + hi = heterozygosity_individual( + g, + fill=np.nan, + corrected=corrected, + ploidy=ploidy + ) + sum_het = np.nansum(hi, axis=-1) - # count hets - n_het = np.asarray(g.count_het(axis=1)) - n_called = np.asarray(g.count_called(axis=1)) + n_called = np.nansum(~np.isnan(hi), axis=-1) - # calculate rate of observed heterozygosity, accounting for variants - # where all calls are missing with ignore_invalid(): - ho = np.where(n_called > 0, n_het / n_called, fill) + ho = np.where(n_called > 0, sum_het / n_called, fill) return ho diff --git a/allel/test/test_stats.py b/allel/test/test_stats.py index b2ee4b6e..ff8220a3 100644 --- a/allel/test/test_stats.py +++ b/allel/test/test_stats.py @@ -385,6 +385,70 @@ def test_windowed_tajima_d(self): class TestHardyWeinberg(unittest.TestCase): + def test_heterozygosity_individual(self): + + # diploid should always be 0 (hom) or 1 (het) + g = GenotypeArray([[[0, 0], [0, 0]], + [[1, 1], [1, 1]], + [[1, 1], [2, 2]], + [[0, 0], [0, 1]], + [[0, 0], [0, 2]], + [[1, 1], [1, 2]], + [[0, 1], [0, 1]], + [[0, 1], [1, 2]], + [[0, 0], [-1, -1]], + [[0, 1], [-1, -1]], + [[-1, -1], [-1, -1]]], dtype='i1') + + hi = allel.heterozygosity_individual(g) + assert np.all(hi[g.is_hom()] == 0) + assert np.all(hi[g.is_het()] == 1) + assert np.all(np.isnan(hi)[g.is_missing()]) + + # mixed ploidy 2x, 4x, 6x + g = GenotypeArray([ + [[0, 0, -1, -1, -1, -1], [0, 0, 0, 0, -1, -1], [0, 0, 0, 0, 0, 0]], + [[1, 1, -1, -1, -1, -1], [0, 0, 0, 1, -1, -1], [0, 0, 0, 0, 0, 1]], + [[0, 1, -1, -1, -1, -1], [0, 0, 1, 1, -1, -1], [0, 0, 0, 0, 1, 1]], + [[1, 2, -1, -1, -1, -1], [0, 0, 1, 2, -1, -1], [0, 0, 0, 1, 1, 1]], + [[3, 6, -1, -1, -1, -1], [0, 1, 2, 3, -1, -1], [0, 0, 0, 0, 1, 2]], + [[0, 0, -1, -1, -1, -1], [0, 2, 2, 5, -1, -1], [0, 0, 0, 1, 1, 2]], + [[0, -1, -1, -1, -1, -1], [0, 1, 1, -1, -1, -1], [0, 0, 1, 1, 2, 2]], + [[0, -1, -1, -1, -1, -1], [0, -1, -1, -1, -1, -1], [0, 0, 1, 2, 3, 4]], + [[-1, -1, -1, -1, -1, -1], [-1, -1, -1, -1, -1, -1], [0, 1, 2, 3, 4, 5]] + ]) + ploidy = np.array([ + [2, 4, 6], + [2, 4, 6], + [2, 4, 6], + [2, 4, 6], + [2, 4, 6], + [2, 4, 6], + [2, 4, 6], + [2, 4, 6], + [2, 4, 6], + ]) + # 4x and 6x values from Hardy (2016) + expect = np.array([ + [0, 0/6, 0/15], + [0, 3/6, 5/15], + [1, 4/6, 8/15], + [1, 5/6, 9/15], + [1, 6/6, 9/15], + [0, 5/6, 11/15], + [-1, -1, 12/15], + [-1, -1, 14/15], + [-1, -1, 15/15], + ]) + actual = allel.heterozygosity_individual(g, fill=-1, ploidy=ploidy) + assert_array_almost_equal(expect, actual) + + # mixed ploidy uncorrected + expect /= (ploidy/(ploidy-1)) + expect[expect < 0] = -1 # correct the filled values + actual = allel.heterozygosity_individual(g, fill=-1, ploidy=ploidy, corrected=False) + assert_array_almost_equal(expect, actual) + def test_heterozygosity_observed(self): # diploid @@ -403,7 +467,17 @@ def test_heterozygosity_observed(self): actual = allel.heterozygosity_observed(g, fill=-1) aeq(expect, actual) - # polyploid + # diploid uncorrected + # corrected is uncorrected * (2/1) + expect = [i / 2 if i >= 0 else i for i in expect] + actual = allel.heterozygosity_observed(g, fill=-1, corrected=False) + assert_array_almost_equal(expect, actual) + + # triploid + # individual heterozygosity (Hi) + # dosage of 3:0:0 = Hi of 0/3 + # dosage of 2:1:0 = Hi of 2/3 + # dosage of 1:1:1 = Hi of 3/3 g = GenotypeArray([[[0, 0, 0], [0, 0, 0]], [[1, 1, 1], [1, 1, 1]], [[1, 1, 1], [2, 2, 2]], @@ -415,9 +489,15 @@ def test_heterozygosity_observed(self): [[0, 0, 0], [-1, -1, -1]], [[0, 0, 1], [-1, -1, -1]], [[-1, -1, -1], [-1, -1, -1]]], dtype='i1') - expect = [0, 0, 0, .5, .5, .5, 1, 1, 0, 1, -1] + expect = [0, 0, 0, 1/3, 1/3, 3/6, 4/6, 5/6, 0, 2/3, -1] actual = allel.heterozygosity_observed(g, fill=-1) - aeq(expect, actual) + assert_array_almost_equal(expect, actual) + + # triploid uncorrected + # corrected is uncorrected * (3/2) + expect = [i / (3/2) if i >= 0 else i for i in expect] + actual = allel.heterozygosity_observed(g, fill=-1, corrected=False) + assert_array_almost_equal(expect, actual) def test_heterozygosity_expected(self):