bigbio · enriquea · Jun 4, 2026 · Jun 4, 2026 · Jun 4, 2026 · enriquea
diff --git a/hvantk/algorithms/enrichex/burden.py b/hvantk/algorithms/enrichex/burden.py
@@ -281,7 +281,11 @@ def _build_gene_to_sets_ht(gene_sets: Dict[str, List[str]]) -> "hl.Table":
     """Build a ``gene -> [gene_set_ids]`` table from the gene-set definitions."""
     gene_to_sets: Dict[str, List[str]] = {}
     for gs_name, genes in gene_sets.items():
-        for gene in genes:
+        # dict.fromkeys dedups within a single set while preserving order, so a
+        # gene listed twice in one set is not double-counted after explode_rows
+        # (n_genes_found / burden). set() would lose order; the canonical
+        # parse_geneset_tsv already dedups upstream, so this guards other callers.
+        for gene in dict.fromkeys(genes):
             if gene not in gene_to_sets:
                 gene_to_sets[gene] = []
             gene_to_sets[gene].append(gs_name)
@@ -394,6 +398,16 @@ def compute_geneset_burden_mt(
     """
     _require_hail()
 
+    # Normalize each set's gene list (dedup within set, order-preserving) up
+    # front so the min_gene_set_size filter, gene_set_size, gene_coverage_pct,
+    # and the gene->set membership are all derived from the same deduped genes.
+    # No-op for the canonical pipeline (parse_geneset_tsv already dedups); this
+    # only affects callers passing raw, duplicate-containing lists, where the
+    # deduped membership would otherwise disagree with len()-based sizes.
+    gene_sets = {
+        name: list(dict.fromkeys(genes)) for name, genes in gene_sets.items()
+    }
+
     # Pre-filter gene sets by minimum size
     if min_gene_set_size > 0:
         n_before = len(gene_sets)

diff --git a/hvantk/algorithms/hgc/qc_report.py b/hvantk/algorithms/hgc/qc_report.py
@@ -57,8 +57,13 @@ def render_qc_summary_markdown(summary_data: Dict[str, dict]) -> str:
                     for stat in _SUMMARY_STAT_COLUMNS:
                         value = stats.get(stat, "N/A")
                         if isinstance(value, (int, float)) and stat != "count":
+                            # Use scientific notation for very small magnitudes
+                            # (e.g. tiny HWE p-values) so they don't collapse to
+                            # "0.000"; keep an exact 0 as "0.000".
                             value = (
-                                f"{value:.3f}" if abs(value) < 1000 else f"{value:.2e}"
+                                f"{value:.3f}"
+                                if value == 0 or 1e-3 <= abs(value) < 1000
+                                else f"{value:.2e}"
                             )
                         row += f" {value} |"
                     md_content += row + "\n"

diff --git a/hvantk/skills/clingen/catalog/datasets.json b/hvantk/skills/clingen/catalog/datasets.json
@@ -4,7 +4,7 @@
     "accession": "ClinGen_GeneDisease",
     "description": "Curated gene-disease associations with evidence-based classifications from ClinGen. Includes classification levels (Definitive, Strong, Moderate, Limited), mode of inheritance, and MONDO disease ontology mappings.",
     "pubmedid": "28552198",
-    "data_source": "Custom",
+    "data_source": "ClinGen",
     "last_updated": "2026-03-06T00:00:00.000000",
     "update_frequency": "monthly",
     "organism": "Homo sapiens",

diff --git a/hvantk/skills/clinvar/catalog/datasets.json b/hvantk/skills/clinvar/catalog/datasets.json
@@ -4,7 +4,7 @@
     "accession": "ClinVar_latest",
     "description": "Public archive of reports of the relationships among human variations and phenotypes, with supporting evidence. ClinVar facilitates access to and communication about the relationships asserted between human variation and observed health status.",
     "pubmedid": "24234437",
-    "data_source": "Custom",
+    "data_source": "ClinVar",
     "last_updated": "2025-09-23T16:30:00.000000",
     "update_frequency": "monthly",
     "organism": "Homo sapiens",

diff --git a/hvantk/skills/dbnsfp/catalog/datasets.json b/hvantk/skills/dbnsfp/catalog/datasets.json
@@ -4,7 +4,7 @@
     "accession": "dbNSFP_v4.7",
     "description": "Comprehensive database of functional predictions and annotations for human nonsynonymous and splice-site SNVs. Includes prediction scores from SIFT, PolyPhen2, CADD, GERP++, PhyloP, PhastCons, and many others for variant pathogenicity assessment.",
     "pubmedid": "21520341",
-    "data_source": "Custom",
+    "data_source": "dbNSFP",
     "last_updated": "2025-09-23T16:30:00.000000",
     "update_frequency": "quarterly",
     "organism": "Homo sapiens",

diff --git a/hvantk/skills/ensembl_gene/catalog/datasets.json b/hvantk/skills/ensembl_gene/catalog/datasets.json
@@ -4,7 +4,7 @@
     "accession": "Ensembl_v110",
     "description": "Comprehensive gene annotations from the Ensembl project including gene models, transcript isoforms, protein sequences, regulatory features, and comparative genomics data across species.",
     "pubmedid": "31691826",
-    "data_source": "Custom",
+    "data_source": "Ensembl",
     "last_updated": "2025-09-23T16:30:00.000000",
     "update_frequency": "quarterly",
     "organism": "Homo sapiens",
@@ -35,7 +35,7 @@
       },
       {
         "path": "Homo_sapiens.GRCh38.110.gff3.gz",
-        "format": "gff",
+        "format": "gff3",
         "size_bytes": 900000000,
         "compression": "gzip",
         "description": "Complete gene annotations in GFF3 format"

diff --git a/hvantk/skills/gevir/catalog/datasets.json b/hvantk/skills/gevir/catalog/datasets.json
@@ -4,7 +4,7 @@
     "accession": "GeVIR_v1.0",
     "description": "Genome-wide scores for ranking the pathogenicity potential of human genetic variants. GeVIR integrates multiple genomic features to provide pathogenicity predictions for both coding and non-coding variants.",
     "pubmedid": "31873297",
-    "data_source": "Custom",
+    "data_source": "GeVIR",
     "last_updated": "2025-09-23T16:30:00.000000",
     "update_frequency": "irregular",
     "organism": "Homo sapiens",

diff --git a/hvantk/skills/gnomad_metrics/catalog/datasets.json b/hvantk/skills/gnomad_metrics/catalog/datasets.json
@@ -4,7 +4,7 @@
     "accession": "gnomAD_v4.1",
     "description": "Large-scale reference dataset of human genetic variation, aggregating exome and genome sequencing data from diverse populations worldwide. Provides allele frequencies, quality metrics, and population-specific annotations.",
     "pubmedid": "32461654",
-    "data_source": "Custom",
+    "data_source": "gnomAD",
     "last_updated": "2025-09-23T16:30:00.000000",
     "update_frequency": "annually",
     "organism": "Homo sapiens",

diff --git a/hvantk/skills/gtex_eqtl/catalog/datasets.json b/hvantk/skills/gtex_eqtl/catalog/datasets.json
@@ -4,7 +4,7 @@
     "accession": "GTEx_v11_eQTL_signif_pairs",
     "description": "GTEx Consortium release v11 cis-eQTL significant variant-gene pairs, one parquet file per tissue. 12 source columns: phenotype_id, variant_id (chr_pos_ref_alt_b38), start_distance, af, ma_samples, ma_count, pval_nominal, slope, slope_se, pval_nominal_threshold, min_pval_nominal, pval_beta. Built into a Hail Table keyed by (locus, alleles, gene_id) via spark.read.parquet \u2192 hl.Table.from_spark; see hvantk/skills/gtex_eqtl/SKILL.md. Used as eQTL input to hvantk/qtlcascade/ and variant annotation.",
     "pubmedid": "32913098",
-    "data_source": "Custom",
+    "data_source": "GTEx",
     "last_updated": "2024-12-01T00:00:00.000000",
     "update_frequency": "irregular",
     "organism": "Homo sapiens",

diff --git a/hvantk/skills/gwas_catalog/catalog/datasets.json b/hvantk/skills/gwas_catalog/catalog/datasets.json
@@ -4,7 +4,7 @@
     "accession": "GWAS_Catalog_v1.0_e115_r2026-04-27",
     "description": "Manually curated collection of all published genome-wide association studies. Each row is a SNP-trait association with study provenance, p-value, effect size (OR or BETA), and risk-allele frequency. v1.0 schema (34 columns, no MAPPED_TRAIT_URI). Built into a Hail Table keyed by (locus, alleles) with a sentinel ALT for variant-annotation joins; see hvantk/skills/gwas_catalog/SKILL.md.",
     "pubmedid": "30445434",
-    "data_source": "Custom",
+    "data_source": "GWAS Catalog",
     "last_updated": "2026-04-27T00:00:00.000000",
     "update_frequency": "quarterly",
     "organism": "Homo sapiens",

diff --git a/hvantk/skills/insider/catalog/datasets.json b/hvantk/skills/insider/catalog/datasets.json
@@ -4,7 +4,7 @@
     "accession": "INSIDER_v1.0",
     "description": "Interactome Insider (Wei et al., Nat Methods 2017). Predicted (ECLAIR), structural (PDB), and homology-derived (I3D) protein-protein interaction interface residues. Distributed as two complementary products: (1) Whole_Human_Interactome_Interface_hg38.bed \u2014 UCSC-style BED with 208,448 named PPI tracks projecting interface residues onto GRCh38 coordinates (this file path), and (2) H_sapiens_interfacesALL.txt \u2014 per protein-pair residue arrays with Source provenance (a separately-onboardable product, not in this catalog entry). Built by a custom track-aware BED parser plus hl.import_table into an interval-keyed Hail Table that preserves PPI IDs from track names; see hvantk/skills/insider/SKILL.md. Used for variant-interval intersection (does a variant fall in any predicted PPI interface?).",
     "pubmedid": "29036289",
-    "data_source": "Custom",
+    "data_source": "INSIDER",
     "last_updated": "2025-09-23T16:30:00.000000",
     "update_frequency": "irregular",
     "organism": "Homo sapiens",

diff --git a/hvantk/skills/msigdb/catalog/datasets.json b/hvantk/skills/msigdb/catalog/datasets.json
@@ -4,7 +4,7 @@
     "accession": "MSigDB_C2_CP_v2026.1.Hs.symbols",
     "description": "Broad Institute Molecular Signatures Database. Collection C2 (curated gene sets) sub-collection CP (Canonical Pathways) \u2014 pathway gene sets curated from online resources (BIOCARTA, KEGG, PID, REACTOME, WP, SA). Human gene-symbol variant. GMT format: tab-separated variable-width rows where column 1 is set_name, column 2 is a gsea-msigdb URL, and columns 3..N are gene symbols. Built into a Hail Table keyed by set_name with genes as array<str>; see hvantk/skills/msigdb/SKILL.md. Used for gene-set lookup in enrichment / burden / overlap analyses.",
     "pubmedid": "16199517",
-    "data_source": "Custom",
+    "data_source": "MSigDB",
     "last_updated": "2026-01-01T00:00:00.000000",
     "update_frequency": "annually",
     "organism": "Homo sapiens",

diff --git a/hvantk/tests/enrichex/test_burden_hail.py b/hvantk/tests/enrichex/test_burden_hail.py
@@ -10,6 +10,7 @@
 
 from hvantk.algorithms.enrichex.burden import (
     VariantFilter,
+    _build_gene_to_sets_ht,
     _compute_test_statistic,
     _linear_t_statistic,
     _logistic_z_statistic,
@@ -182,6 +183,38 @@ def test_compute_geneset_burden_invalid_method(self, hail_session):
                 genotype_aggregation="invalid",
             )
 
+    def test_compute_geneset_burden_dedups_gene_set_size(self, hail_session):
+        """gene_set_size must reflect the DEDUPED gene count so it stays
+        consistent with n_genes_found / gene_coverage_pct when a caller passes
+        duplicate genes within a set (PR #187 review: Copilot)."""
+        mt = self.setup_test_mt()
+        gene_sets = {"set1": ["GENE1", "GENE1", "GENE2"]}  # GENE1 duplicated
+        mt_burden = compute_geneset_burden_mt(mt, gene_sets, gene_field="SYMBOL")
+        row = mt_burden.rows().collect()[0]
+        assert row.gene_set_size == 2  # deduped (GENE1, GENE2), not 3
+
+
+@pytest.mark.hail
+class TestBuildGeneToSetsHt:
+    """Regression tests for _build_gene_to_sets_ht (gene -> [set ids])."""
+
+    def test_dedups_repeated_gene_within_a_set(self, hail_session):
+        """A gene listed twice in one set maps to that set once, not twice —
+        otherwise it is double-counted after explode_rows (n_genes_found /
+        burden). Regression for #185."""
+        ht = _build_gene_to_sets_ht({"panel_a": ["BRCA1", "BRCA1", "TP53"]})
+        rows = {r["gene"]: list(r["gene_set_ids"]) for r in ht.collect()}
+        assert rows["BRCA1"] == ["panel_a"]  # not ["panel_a", "panel_a"]
+        assert rows["TP53"] == ["panel_a"]
+
+    def test_keeps_distinct_sets_for_a_shared_gene(self, hail_session):
+        """Within-set dedup must not collapse a gene's membership across sets."""
+        ht = _build_gene_to_sets_ht(
+            {"panel_a": ["BRCA1", "BRCA1"], "panel_b": ["BRCA1"]}
+        )
+        rows = {r["gene"]: list(r["gene_set_ids"]) for r in ht.collect()}
+        assert rows["BRCA1"] == ["panel_a", "panel_b"]
+
 
 @pytest.mark.hail
 class TestLogisticBurdenTest:

diff --git a/hvantk/tools/enrichex/burden_cli.py b/hvantk/tools/enrichex/burden_cli.py
@@ -42,7 +42,7 @@ def _render_burden_summary(result_df, alpha, phenotype_type: str) -> None:
     click.echo(f"Significant gene sets (p_adj < {alpha}): {n_significant}")
 
     if n_significant > 0:
-        click.echo(f"\nTop significant gene sets:")
+        click.echo("\nTop significant gene sets:")
         click.echo("-" * 60)
 
         # Select columns based on phenotype type
@@ -434,13 +434,13 @@ def burden_test(
         click.echo("\n" + "=" * 60)
         click.echo("ENRICHEX BURDEN ANALYSIS PLAN (Hail-Native)")
         click.echo("=" * 60)
-        click.echo(f"\nInputs:")
+        click.echo("\nInputs:")
         click.echo(f"  Cohort MT: {cohort_mt}")
         click.echo(f"  Phenotypes: {phenotypes}")
         click.echo(f"    Field: {phenotype_field}")
         click.echo(f"    Type: {phenotype_type}")
         click.echo(f"  Gene sets: {gene_sets}")
-        click.echo(f"\nVariant Filters:")
+        click.echo("\nVariant Filters:")
         if max_af is not None:
             click.echo(f"  Max AF: {max_af} (field: {af_field})")
         if min_score is not None:
@@ -450,18 +450,18 @@ def burden_test(
         if consequences:
             click.echo(f"  Consequences: {consequences} (field: {consequence_field})")
         if max_af is None and min_score is None and not consequences:
-            click.echo(f"  None (MT assumed pre-filtered)")
-        click.echo(f"\nAnalysis:")
+            click.echo("  None (MT assumed pre-filtered)")
+        click.echo("\nAnalysis:")
         click.echo(f"  Gene field: {gene_field}")
         click.echo(f"  Genotype aggregation: {genotype_aggregation}")
         if covariates:
             click.echo(f"  Covariates: {covariates}")
         if normalize_by_length:
-            click.echo(f"  Normalize by length: Yes")
+            click.echo("  Normalize by length: Yes")
             if gene_lengths:
                 click.echo(f"  Gene lengths file: {gene_lengths}")
             else:
-                click.echo(f"  Gene lengths: using variant site count proxy")
+                click.echo("  Gene lengths: using variant site count proxy")
         if min_carriers > 0:
             click.echo(f"  Min carriers: {min_carriers}")
         if variant_classes:

diff --git a/hvantk/tools/plugins/plugins_cli.py b/hvantk/tools/plugins/plugins_cli.py
@@ -82,9 +82,16 @@ def validate_cmd(manifest_path: str):
             catalog_schema = json.loads(
                 (Path(plugin_loader.__file__).parent / "catalog_entry.schema.json").read_text()
             )
+        except (OSError, json.JSONDecodeError) as exc:
+            raise click.ClickException(
+                f"failed to read bundled catalog schema: {exc}"
+            ) from exc
+        try:
             entries = json.loads(catalog_path.read_text())
         except (OSError, json.JSONDecodeError) as exc:
-            raise click.ClickException(f"failed to read catalog {catalog_path}: {exc}")
+            raise click.ClickException(
+                f"failed to read catalog {catalog_path}: {exc}"
+            ) from exc
         if not isinstance(entries, list):
             raise click.ClickException(f"catalog must be a JSON array: {catalog_path}")
         errors: list[str] = []