Skip to content

Commit f8aeca2

Browse files
ch-krCopilotmike-w-wilson
authored
Add coverage entry agg function (#790)
* create get_coverage_agg_func * add dp_field arg to compute_coverage_stats and add periods to docstring * move dp_field per copilot suggestion * Update gnomad/utils/sparse_mt.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * Add tests for get_coverage_agg_func * Remove print statements for debugging in test_sparse_mt * Switch to hail's NaN and remove hardocded expected value -- update tests to use transofrmed DP, not raw, as it will be used in prod * Remove pytest import and sample_ht as it wasn't being accessed * Remove unneeded tests and misleading test names * Update tests/utils/test_sparse_mt.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * Update tests/utils/test_sparse_mt.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * Remove unneeded sparse_mt test * asked cursor to add periods to comments * asked cursor to rename functions _ if they weren't being used * asked cursor to remove irrelevant comment * asked cursor to rename function and remove unncessary max_cov_bin=50 call * unused transform_func > _ * asked cursor to remove redundant custom field name tests * asked cursor to update test_transform_and_aggregation_integration * improve median approx test documentation * ask cursor to fix its comment 'accept actual behavior' * ask cursor to fix vague > 0 assert * add link to hail docs --------- Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Co-authored-by: Mike Wilson <mwilson@broadinstitute.org>
1 parent 5b0bdbe commit f8aeca2

File tree

2 files changed

+378
-16
lines changed

2 files changed

+378
-16
lines changed

gnomad/utils/sparse_mt.py

Lines changed: 40 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1275,6 +1275,36 @@ def compute_stats_per_ref_site(
12751275
return ht
12761276

12771277

1278+
def get_coverage_agg_func(
1279+
dp_field: str = "DP", max_cov_bin: int = 100
1280+
) -> Tuple[Callable, Callable]:
1281+
"""
1282+
Get a transformation and aggregation function for computing coverage.
1283+
1284+
Can be used as an entry aggregation function in `compute_stats_per_ref_site`.
1285+
1286+
:param dp_field: Depth field to use for computing coverage. Default is 'DP'.
1287+
:param max_cov_bin: Maximum coverage bin (used when computing samples over X bin). Default is 100.
1288+
:return: Tuple of functions to transform and aggregate coverage.
1289+
"""
1290+
return (
1291+
lambda t: hl.if_else(
1292+
hl.is_missing(t[dp_field]) | hl.is_nan(t[dp_field]), 0, t[dp_field]
1293+
),
1294+
lambda dp: hl.struct(
1295+
# This expression creates a counter DP -> number of samples for DP
1296+
# between 0 and max_cov_bin.
1297+
coverage_counter=hl.agg.counter(hl.min(max_cov_bin, dp)),
1298+
mean=hl.bind(
1299+
lambda mean_dp: hl.if_else(hl.is_nan(mean_dp), 0, mean_dp),
1300+
hl.agg.mean(dp),
1301+
),
1302+
median_approx=hl.or_else(hl.agg.approx_median(dp), 0),
1303+
total_DP=hl.agg.sum(dp),
1304+
),
1305+
)
1306+
1307+
12781308
def compute_coverage_stats(
12791309
mtds: Union[hl.MatrixTable, hl.vds.VariantDataset],
12801310
reference_ht: hl.Table,
@@ -1283,6 +1313,7 @@ def compute_coverage_stats(
12831313
row_key_fields: List[str] = ["locus"],
12841314
strata_expr: Optional[List[Dict[str, hl.expr.StringExpression]]] = None,
12851315
group_membership_ht: Optional[hl.Table] = None,
1316+
dp_field: str = "DP",
12861317
) -> hl.Table:
12871318
"""
12881319
Compute coverage statistics for every base of the `reference_ht` provided.
@@ -1297,18 +1328,19 @@ def compute_coverage_stats(
12971328
computed on. It needs to be keyed by `locus`. The `reference_ht` can e.g. be
12981329
created using `get_reference_ht`.
12991330
1300-
:param mtds: Input sparse MT or VDS
1301-
:param reference_ht: Input reference HT
1302-
:param interval_ht: Optional Table containing intervals to filter to
1303-
:param coverage_over_x_bins: List of boundaries for computing samples over X
1331+
:param mtds: Input sparse MT or VDS.
1332+
:param reference_ht: Input reference HT.
1333+
:param interval_ht: Optional Table containing intervals to filter to.
1334+
:param coverage_over_x_bins: List of boundaries for computing samples over X.
13041335
:param row_key_fields: List of row key fields to use for joining `mtds` with
1305-
`reference_ht`
1336+
`reference_ht`.
13061337
:param strata_expr: Optional list of dicts containing expressions to stratify the
13071338
coverage stats by. Only one of `group_membership_ht` or `strata_expr` can be
13081339
specified.
13091340
:param group_membership_ht: Optional Table containing group membership annotations
13101341
to stratify the coverage stats by. Only one of `group_membership_ht` or
13111342
`strata_expr` can be specified.
1343+
:param dp_field: Name of sample depth field. Default is DP.
13121344
:return: Table with per-base coverage stats.
13131345
"""
13141346
is_vds = isinstance(mtds, hl.vds.VariantDataset)
@@ -1331,16 +1363,8 @@ def compute_coverage_stats(
13311363
max_cov_bin = cov_bins[-1]
13321364
cov_bins = hl.array(cov_bins)
13331365
entry_agg_funcs = {
1334-
"coverage_stats": (
1335-
lambda t: hl.if_else(hl.is_missing(t.DP) | hl.is_nan(t.DP), 0, t.DP),
1336-
lambda dp: hl.struct(
1337-
# This expression creates a counter DP -> number of samples for DP
1338-
# between 0 and max_cov_bin.
1339-
coverage_counter=hl.agg.counter(hl.min(max_cov_bin, dp)),
1340-
mean=hl.if_else(hl.is_nan(hl.agg.mean(dp)), 0, hl.agg.mean(dp)),
1341-
median_approx=hl.or_else(hl.agg.approx_median(dp), 0),
1342-
total_DP=hl.agg.sum(dp),
1343-
),
1366+
"coverage_stats": get_coverage_agg_func(
1367+
dp_field=dp_field, max_cov_bin=max_cov_bin
13441368
)
13451369
}
13461370

@@ -1350,7 +1374,7 @@ def compute_coverage_stats(
13501374
entry_agg_funcs,
13511375
row_key_fields=row_key_fields,
13521376
interval_ht=interval_ht,
1353-
entry_keep_fields=[gt_field, "DP"],
1377+
entry_keep_fields=[gt_field, dp_field],
13541378
strata_expr=strata_expr,
13551379
group_membership_ht=group_membership_ht,
13561380
)

0 commit comments

Comments
 (0)