Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 15 additions & 1 deletion hvantk/algorithms/enrichex/burden.py
Original file line number Diff line number Diff line change
Expand Up @@ -281,7 +281,11 @@ def _build_gene_to_sets_ht(gene_sets: Dict[str, List[str]]) -> "hl.Table":
"""Build a ``gene -> [gene_set_ids]`` table from the gene-set definitions."""
gene_to_sets: Dict[str, List[str]] = {}
for gs_name, genes in gene_sets.items():
for gene in genes:
# dict.fromkeys dedups within a single set while preserving order, so a
# gene listed twice in one set is not double-counted after explode_rows
# (n_genes_found / burden). set() would lose order; the canonical
# parse_geneset_tsv already dedups upstream, so this guards other callers.
for gene in dict.fromkeys(genes):
Comment on lines +284 to +288

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Valid — verified in compute_geneset_burden_mt: min_gene_set_size filtered on raw len(v) (line 404), gene_set_size was len(genes) (line 537), and gene_coverage_pct divided by that raw size (line 552), while n_genes_found came from the deduped membership — so a non-deduplicated caller would get inconsistent size/coverage.

Fixed in 9c3ce41 by normalizing each set's gene list once at the entry of compute_geneset_burden_mt (gene_sets = {name: list(dict.fromkeys(genes)) …}), so the min-size filter, gene_set_size, gene_coverage_pct, and the gene→set membership all derive from the same deduped genes. It's a no-op for the canonical pipeline (parse_geneset_tsv already dedups). Kept the _build_gene_to_sets_ht dedup as defense for direct callers, and added a regression test asserting gene_set_size reflects the deduped count.

if gene not in gene_to_sets:
gene_to_sets[gene] = []
gene_to_sets[gene].append(gs_name)
Expand Down Expand Up @@ -394,6 +398,16 @@ def compute_geneset_burden_mt(
"""
_require_hail()

# Normalize each set's gene list (dedup within set, order-preserving) up
# front so the min_gene_set_size filter, gene_set_size, gene_coverage_pct,
# and the gene->set membership are all derived from the same deduped genes.
# No-op for the canonical pipeline (parse_geneset_tsv already dedups); this
# only affects callers passing raw, duplicate-containing lists, where the
# deduped membership would otherwise disagree with len()-based sizes.
gene_sets = {
name: list(dict.fromkeys(genes)) for name, genes in gene_sets.items()
}

# Pre-filter gene sets by minimum size
if min_gene_set_size > 0:
n_before = len(gene_sets)
Expand Down
7 changes: 6 additions & 1 deletion hvantk/algorithms/hgc/qc_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,13 @@ def render_qc_summary_markdown(summary_data: Dict[str, dict]) -> str:
for stat in _SUMMARY_STAT_COLUMNS:
value = stats.get(stat, "N/A")
if isinstance(value, (int, float)) and stat != "count":
# Use scientific notation for very small magnitudes
# (e.g. tiny HWE p-values) so they don't collapse to
# "0.000"; keep an exact 0 as "0.000".
value = (
f"{value:.3f}" if abs(value) < 1000 else f"{value:.2e}"
f"{value:.3f}"
if value == 0 or 1e-3 <= abs(value) < 1000
else f"{value:.2e}"
)
row += f" {value} |"
md_content += row + "\n"
Expand Down
2 changes: 1 addition & 1 deletion hvantk/skills/clingen/catalog/datasets.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
"accession": "ClinGen_GeneDisease",
"description": "Curated gene-disease associations with evidence-based classifications from ClinGen. Includes classification levels (Definitive, Strong, Moderate, Limited), mode of inheritance, and MONDO disease ontology mappings.",
"pubmedid": "28552198",
"data_source": "Custom",
"data_source": "ClinGen",
"last_updated": "2026-03-06T00:00:00.000000",
"update_frequency": "monthly",
"organism": "Homo sapiens",
Expand Down
2 changes: 1 addition & 1 deletion hvantk/skills/clinvar/catalog/datasets.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
"accession": "ClinVar_latest",
"description": "Public archive of reports of the relationships among human variations and phenotypes, with supporting evidence. ClinVar facilitates access to and communication about the relationships asserted between human variation and observed health status.",
"pubmedid": "24234437",
"data_source": "Custom",
"data_source": "ClinVar",
"last_updated": "2025-09-23T16:30:00.000000",
"update_frequency": "monthly",
"organism": "Homo sapiens",
Expand Down
2 changes: 1 addition & 1 deletion hvantk/skills/dbnsfp/catalog/datasets.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
"accession": "dbNSFP_v4.7",
"description": "Comprehensive database of functional predictions and annotations for human nonsynonymous and splice-site SNVs. Includes prediction scores from SIFT, PolyPhen2, CADD, GERP++, PhyloP, PhastCons, and many others for variant pathogenicity assessment.",
"pubmedid": "21520341",
"data_source": "Custom",
"data_source": "dbNSFP",
"last_updated": "2025-09-23T16:30:00.000000",
"update_frequency": "quarterly",
"organism": "Homo sapiens",
Expand Down
4 changes: 2 additions & 2 deletions hvantk/skills/ensembl_gene/catalog/datasets.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
"accession": "Ensembl_v110",
"description": "Comprehensive gene annotations from the Ensembl project including gene models, transcript isoforms, protein sequences, regulatory features, and comparative genomics data across species.",
"pubmedid": "31691826",
"data_source": "Custom",
"data_source": "Ensembl",
"last_updated": "2025-09-23T16:30:00.000000",
"update_frequency": "quarterly",
"organism": "Homo sapiens",
Expand Down Expand Up @@ -35,7 +35,7 @@
},
{
"path": "Homo_sapiens.GRCh38.110.gff3.gz",
"format": "gff",
"format": "gff3",
"size_bytes": 900000000,
"compression": "gzip",
"description": "Complete gene annotations in GFF3 format"
Expand Down
2 changes: 1 addition & 1 deletion hvantk/skills/gevir/catalog/datasets.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
"accession": "GeVIR_v1.0",
"description": "Genome-wide scores for ranking the pathogenicity potential of human genetic variants. GeVIR integrates multiple genomic features to provide pathogenicity predictions for both coding and non-coding variants.",
"pubmedid": "31873297",
"data_source": "Custom",
"data_source": "GeVIR",
"last_updated": "2025-09-23T16:30:00.000000",
"update_frequency": "irregular",
"organism": "Homo sapiens",
Expand Down
2 changes: 1 addition & 1 deletion hvantk/skills/gnomad_metrics/catalog/datasets.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
"accession": "gnomAD_v4.1",
"description": "Large-scale reference dataset of human genetic variation, aggregating exome and genome sequencing data from diverse populations worldwide. Provides allele frequencies, quality metrics, and population-specific annotations.",
"pubmedid": "32461654",
"data_source": "Custom",
"data_source": "gnomAD",
"last_updated": "2025-09-23T16:30:00.000000",
"update_frequency": "annually",
"organism": "Homo sapiens",
Expand Down
2 changes: 1 addition & 1 deletion hvantk/skills/gtex_eqtl/catalog/datasets.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
"accession": "GTEx_v11_eQTL_signif_pairs",
"description": "GTEx Consortium release v11 cis-eQTL significant variant-gene pairs, one parquet file per tissue. 12 source columns: phenotype_id, variant_id (chr_pos_ref_alt_b38), start_distance, af, ma_samples, ma_count, pval_nominal, slope, slope_se, pval_nominal_threshold, min_pval_nominal, pval_beta. Built into a Hail Table keyed by (locus, alleles, gene_id) via spark.read.parquet \u2192 hl.Table.from_spark; see hvantk/skills/gtex_eqtl/SKILL.md. Used as eQTL input to hvantk/qtlcascade/ and variant annotation.",
"pubmedid": "32913098",
"data_source": "Custom",
"data_source": "GTEx",
"last_updated": "2024-12-01T00:00:00.000000",
"update_frequency": "irregular",
"organism": "Homo sapiens",
Expand Down
2 changes: 1 addition & 1 deletion hvantk/skills/gwas_catalog/catalog/datasets.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
"accession": "GWAS_Catalog_v1.0_e115_r2026-04-27",
"description": "Manually curated collection of all published genome-wide association studies. Each row is a SNP-trait association with study provenance, p-value, effect size (OR or BETA), and risk-allele frequency. v1.0 schema (34 columns, no MAPPED_TRAIT_URI). Built into a Hail Table keyed by (locus, alleles) with a sentinel ALT for variant-annotation joins; see hvantk/skills/gwas_catalog/SKILL.md.",
"pubmedid": "30445434",
"data_source": "Custom",
"data_source": "GWAS Catalog",
"last_updated": "2026-04-27T00:00:00.000000",
"update_frequency": "quarterly",
"organism": "Homo sapiens",
Expand Down
2 changes: 1 addition & 1 deletion hvantk/skills/insider/catalog/datasets.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
"accession": "INSIDER_v1.0",
"description": "Interactome Insider (Wei et al., Nat Methods 2017). Predicted (ECLAIR), structural (PDB), and homology-derived (I3D) protein-protein interaction interface residues. Distributed as two complementary products: (1) Whole_Human_Interactome_Interface_hg38.bed \u2014 UCSC-style BED with 208,448 named PPI tracks projecting interface residues onto GRCh38 coordinates (this file path), and (2) H_sapiens_interfacesALL.txt \u2014 per protein-pair residue arrays with Source provenance (a separately-onboardable product, not in this catalog entry). Built by a custom track-aware BED parser plus hl.import_table into an interval-keyed Hail Table that preserves PPI IDs from track names; see hvantk/skills/insider/SKILL.md. Used for variant-interval intersection (does a variant fall in any predicted PPI interface?).",
"pubmedid": "29036289",
"data_source": "Custom",
"data_source": "INSIDER",
"last_updated": "2025-09-23T16:30:00.000000",
"update_frequency": "irregular",
"organism": "Homo sapiens",
Expand Down
2 changes: 1 addition & 1 deletion hvantk/skills/msigdb/catalog/datasets.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
"accession": "MSigDB_C2_CP_v2026.1.Hs.symbols",
"description": "Broad Institute Molecular Signatures Database. Collection C2 (curated gene sets) sub-collection CP (Canonical Pathways) \u2014 pathway gene sets curated from online resources (BIOCARTA, KEGG, PID, REACTOME, WP, SA). Human gene-symbol variant. GMT format: tab-separated variable-width rows where column 1 is set_name, column 2 is a gsea-msigdb URL, and columns 3..N are gene symbols. Built into a Hail Table keyed by set_name with genes as array<str>; see hvantk/skills/msigdb/SKILL.md. Used for gene-set lookup in enrichment / burden / overlap analyses.",
"pubmedid": "16199517",
"data_source": "Custom",
"data_source": "MSigDB",
"last_updated": "2026-01-01T00:00:00.000000",
"update_frequency": "annually",
"organism": "Homo sapiens",
Expand Down
33 changes: 33 additions & 0 deletions hvantk/tests/enrichex/test_burden_hail.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

from hvantk.algorithms.enrichex.burden import (
VariantFilter,
_build_gene_to_sets_ht,
_compute_test_statistic,
_linear_t_statistic,
_logistic_z_statistic,
Expand Down Expand Up @@ -182,6 +183,38 @@ def test_compute_geneset_burden_invalid_method(self, hail_session):
genotype_aggregation="invalid",
)

def test_compute_geneset_burden_dedups_gene_set_size(self, hail_session):
"""gene_set_size must reflect the DEDUPED gene count so it stays
consistent with n_genes_found / gene_coverage_pct when a caller passes
duplicate genes within a set (PR #187 review: Copilot)."""
mt = self.setup_test_mt()
gene_sets = {"set1": ["GENE1", "GENE1", "GENE2"]} # GENE1 duplicated
mt_burden = compute_geneset_burden_mt(mt, gene_sets, gene_field="SYMBOL")
row = mt_burden.rows().collect()[0]
assert row.gene_set_size == 2 # deduped (GENE1, GENE2), not 3


@pytest.mark.hail
class TestBuildGeneToSetsHt:
"""Regression tests for _build_gene_to_sets_ht (gene -> [set ids])."""

def test_dedups_repeated_gene_within_a_set(self, hail_session):
"""A gene listed twice in one set maps to that set once, not twice —
otherwise it is double-counted after explode_rows (n_genes_found /
burden). Regression for #185."""
ht = _build_gene_to_sets_ht({"panel_a": ["BRCA1", "BRCA1", "TP53"]})
rows = {r["gene"]: list(r["gene_set_ids"]) for r in ht.collect()}
assert rows["BRCA1"] == ["panel_a"] # not ["panel_a", "panel_a"]
assert rows["TP53"] == ["panel_a"]

def test_keeps_distinct_sets_for_a_shared_gene(self, hail_session):
"""Within-set dedup must not collapse a gene's membership across sets."""
ht = _build_gene_to_sets_ht(
{"panel_a": ["BRCA1", "BRCA1"], "panel_b": ["BRCA1"]}
)
rows = {r["gene"]: list(r["gene_set_ids"]) for r in ht.collect()}
assert rows["BRCA1"] == ["panel_a", "panel_b"]


@pytest.mark.hail
class TestLogisticBurdenTest:
Expand Down
14 changes: 7 additions & 7 deletions hvantk/tools/enrichex/burden_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def _render_burden_summary(result_df, alpha, phenotype_type: str) -> None:
click.echo(f"Significant gene sets (p_adj < {alpha}): {n_significant}")

if n_significant > 0:
click.echo(f"\nTop significant gene sets:")
click.echo("\nTop significant gene sets:")
click.echo("-" * 60)

# Select columns based on phenotype type
Expand Down Expand Up @@ -434,13 +434,13 @@ def burden_test(
click.echo("\n" + "=" * 60)
click.echo("ENRICHEX BURDEN ANALYSIS PLAN (Hail-Native)")
click.echo("=" * 60)
click.echo(f"\nInputs:")
click.echo("\nInputs:")
click.echo(f" Cohort MT: {cohort_mt}")
click.echo(f" Phenotypes: {phenotypes}")
click.echo(f" Field: {phenotype_field}")
click.echo(f" Type: {phenotype_type}")
click.echo(f" Gene sets: {gene_sets}")
click.echo(f"\nVariant Filters:")
click.echo("\nVariant Filters:")
if max_af is not None:
click.echo(f" Max AF: {max_af} (field: {af_field})")
if min_score is not None:
Expand All @@ -450,18 +450,18 @@ def burden_test(
if consequences:
click.echo(f" Consequences: {consequences} (field: {consequence_field})")
if max_af is None and min_score is None and not consequences:
click.echo(f" None (MT assumed pre-filtered)")
click.echo(f"\nAnalysis:")
click.echo(" None (MT assumed pre-filtered)")
click.echo("\nAnalysis:")
click.echo(f" Gene field: {gene_field}")
click.echo(f" Genotype aggregation: {genotype_aggregation}")
if covariates:
click.echo(f" Covariates: {covariates}")
if normalize_by_length:
click.echo(f" Normalize by length: Yes")
click.echo(" Normalize by length: Yes")
if gene_lengths:
click.echo(f" Gene lengths file: {gene_lengths}")
else:
click.echo(f" Gene lengths: using variant site count proxy")
click.echo(" Gene lengths: using variant site count proxy")
if min_carriers > 0:
click.echo(f" Min carriers: {min_carriers}")
if variant_classes:
Expand Down
9 changes: 8 additions & 1 deletion hvantk/tools/plugins/plugins_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,9 +82,16 @@ def validate_cmd(manifest_path: str):
catalog_schema = json.loads(
(Path(plugin_loader.__file__).parent / "catalog_entry.schema.json").read_text()
)
except (OSError, json.JSONDecodeError) as exc:
raise click.ClickException(
f"failed to read bundled catalog schema: {exc}"
) from exc
try:
entries = json.loads(catalog_path.read_text())
except (OSError, json.JSONDecodeError) as exc:
raise click.ClickException(f"failed to read catalog {catalog_path}: {exc}")
raise click.ClickException(
f"failed to read catalog {catalog_path}: {exc}"
) from exc
if not isinstance(entries, list):
raise click.ClickException(f"catalog must be a JSON array: {catalog_path}")
errors: list[str] = []
Expand Down
Loading