Skip to content

Commit cf4219f

Browse files
authored
feat: --maf option for filtering output of transform command (#274)
1 parent 81429ae commit cf4219f

17 files changed

+168
-93
lines changed

.devcontainer.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
"extensions": [
2323
"ms-python.python",
2424
"ms-vscode.live-server",
25-
"ms-python.black-formatter",
25+
"ms-python.black-formatter@2025.2.0",
2626
"GitHub.vscode-github-actions"
2727
],
2828
"settings": {

haptools/__main__.py

+9
Original file line numberDiff line numberDiff line change
@@ -622,6 +622,13 @@ def simphenotype(
622622
default=False,
623623
help="Also transform using VCF 'POP' FORMAT field and 'ancestry' .hap extra field",
624624
)
625+
@click.option(
626+
"--maf",
627+
type=float,
628+
default=None,
629+
show_default="all haplotypes",
630+
help="Do not output haplotypes with an MAF below this value",
631+
)
625632
@click.option(
626633
"-o",
627634
"--output",
@@ -649,6 +656,7 @@ def transform(
649656
chunk_size: int = None,
650657
discard_missing: bool = False,
651658
ancestry: bool = False,
659+
maf: float = None,
652660
output: Path = Path("-"),
653661
verbosity: str = "INFO",
654662
):
@@ -694,6 +702,7 @@ def transform(
694702
chunk_size,
695703
discard_missing,
696704
ancestry,
705+
maf,
697706
output,
698707
log,
699708
)

haptools/data/breakpoints.py

-1
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@
1212

1313
from .data import Data
1414

15-
1615
# A haplotype block consists of
1716
# 1) pop - A population label (str), like 'YRI'
1817
# 2) chrom - A chromosome name (str), like 'chr19' or simply '19'

haptools/data/genotypes.py

+1-2
Original file line numberDiff line numberDiff line change
@@ -515,8 +515,7 @@ def check_biallelic(self, discard_also=False):
515515
self._var_idx = None
516516
else:
517517
raise ValueError(
518-
"Variant with ID {} at POS {}:{} is multiallelic for sample {}"
519-
.format(
518+
"Variant with ID {} at POS {}:{} is multiallelic for sample {}".format(
520519
*tuple(self.variants[variant_idx[0]])[:3],
521520
self.samples[samp_idx[0]],
522521
)

haptools/data/haplotypes.py

-1
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@
1212
from .data import Data
1313
from .genotypes import GenotypesVCF
1414

15-
1615
# the current version of the hap format spec
1716
HAP_VERSION = "0.2.0"
1817

haptools/data/tr_harmonizer.py

+4-6
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
Utilities for harmonizing tandem repeat VCF records.
33
Handles VCFs generated by various TR genotyping tools
44
"""
5+
56
import re
67
import math
78
import enum
@@ -462,8 +463,7 @@ def _HarmonizeHipSTRRecord(vcfrecord: cyvcf2.Variant):
462463
):
463464
raise TypeError(
464465
"Record at {}:{} is missing one of the mandatory HipSTR info fields START,"
465-
" END, PERIOD. ".format(vcfrecord.CHROM, vcfrecord.POS)
466-
+ _beagle_error
466+
" END, PERIOD. ".format(vcfrecord.CHROM, vcfrecord.POS) + _beagle_error
467467
)
468468
# determine full alleles and trimmed alleles
469469
pos = int(vcfrecord.POS)
@@ -536,8 +536,7 @@ def _HarmonizeAdVNTRRecord(vcfrecord: cyvcf2.Variant):
536536
if vcfrecord.INFO.get("RU") is None or vcfrecord.INFO.get("VID") is None:
537537
raise TypeError(
538538
"Record at {}:{} is missing one of the mandatory ADVNTR info fields RU,"
539-
" VID. ".format(vcfrecord.CHROM, vcfrecord.POS)
540-
+ _beagle_error
539+
" VID. ".format(vcfrecord.CHROM, vcfrecord.POS) + _beagle_error
541540
)
542541
ref_allele = vcfrecord.REF.upper()
543542
if vcfrecord.ALT:
@@ -651,8 +650,7 @@ def _HarmonizeEHRecord(vcfrecord: cyvcf2.Variant):
651650
if vcfrecord.INFO.get("VARID") is None or vcfrecord.INFO.get("RU") is None:
652651
raise TypeError(
653652
"Record at {}:{} is missing one of the mandatory ExpansionHunter info"
654-
" fields VARID, RU. ".format(vcfrecord.CHROM, vcfrecord.POS)
655-
+ _beagle_error
653+
" fields VARID, RU. ".format(vcfrecord.CHROM, vcfrecord.POS) + _beagle_error
656654
)
657655
record_id = vcfrecord.INFO["VARID"]
658656
motif = vcfrecord.INFO["RU"].upper()

haptools/transform.py

+8-2
Original file line numberDiff line numberDiff line change
@@ -402,8 +402,7 @@ def check_biallelic(self, discard_also=False):
402402
self._var_idx = None
403403
else:
404404
raise ValueError(
405-
"Variant with ID {} at POS {}:{} is multiallelic for sample {}"
406-
.format(
405+
"Variant with ID {} at POS {}:{} is multiallelic for sample {}".format(
407406
*tuple(self.variants[variant_idx[0]])[:3],
408407
self.samples[samp_idx[0]],
409408
)
@@ -537,6 +536,7 @@ def transform_haps(
537536
chunk_size: int = None,
538537
discard_missing: bool = False,
539538
ancestry: bool = False,
539+
maf: float = None,
540540
output: Path = Path("-"),
541541
log: logging.Logger = None,
542542
):
@@ -571,6 +571,8 @@ def transform_haps(
571571
ancestry : bool, optional
572572
Whether to also match ancestral population labels from the VCF against those in
573573
the .hap file
574+
maf : float, optional
575+
If specified, only haplotypes with an MAF above this value will be output
574576
output : Path, optional
575577
The location to which to write output
576578
log : Logger, optional
@@ -666,6 +668,10 @@ def transform_haps(
666668
log.info("Transforming genotypes via haplotypes")
667669
hp.transform(gt, hp_gt)
668670

671+
if maf is not None:
672+
log.info(f"Removing haplotypes with MAF < {maf}")
673+
hp_gt.check_maf(threshold=maf, discard_also=True)
674+
669675
log.info(f"Writing haplotypes to {out_file_type} file")
670676
hp_gt.write()
671677

noxfile.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
"""Nox sessions."""
2+
23
import os
34
import shutil
45
from pathlib import Path
@@ -7,7 +8,6 @@
78
from nox_poetry import Session # type: ignore
89
from nox_poetry import session # type: ignore
910

10-
1111
package = "haptools"
1212
python_versions = ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12", "3.13"]
1313
locked_python_version = "3.9" # keep in sync with dev-env.yml
@@ -37,7 +37,7 @@ def docs(session: Session) -> None:
3737
def lint(session: Session) -> None:
3838
"""Lint our code."""
3939
session.install("black")
40-
session.run("black", "--verbose", "--check", ".")
40+
session.run("black", "--diff", "--verbose", "--check", ".")
4141

4242

4343
# detect whether conda/mamba is installed

0 commit comments

Comments
 (0)