bigbio · enriquea · Jun 8, 2026 · Jun 4, 2026 · Jun 4, 2026 · Jun 8, 2026
diff --git a/.github/scripts/drift_to_pr.py b/.github/scripts/drift_to_pr.py
@@ -11,7 +11,7 @@
     [
       {
         "dataset_name": "clinvar:variants",
-        "status": "clean" | "drifted" | "probe_failed",
+        "status": "clean" | "drifted" | "probe_failed" | "stub",
         "observed": {...} | null,
         "expected": {...} | null,
         "diff": {...} | null,
@@ -32,7 +32,10 @@
 6. Opens a draft PR (or updates the body of an existing one).
 
 Probe-failed entries are recorded in the GitHub step summary but never
-produce a PR (they are infrastructure failures, not data drift).
+produce a PR (they are infrastructure failures, not data drift). ``stub``
+entries (documentation-only sources with no programmatic probe) are likewise
+never turned into PRs; they are counted in the printed summary and listed in
+the step summary so they are not silently dropped.
 
 Exit code is always 0 unless a wholly unexpected error escapes; drift /
 probe-failed are not workflow failures.
@@ -366,6 +369,26 @@ def handle_probe_failed(
     )
 
 
+def handle_stub(
+    entry: dict,
+    *,
+    step_summary: Path | None,
+) -> None:
+    """Surface a documentation-only stub in the rendered step summary so it is
+    not silently dropped. Stubs never open a PR (they are not data drift) and
+    never fail the workflow (the script still exits 0)."""
+    dataset = entry["dataset_name"]
+    reason = (entry.get("observed") or {}).get(  # tolerate "observed": null
+        "reason", "documentation-only source; no programmatic probe"
+    )
+    print(f"\n=== Stub (no real drift detection): {dataset} ===")
+    print(f"  {reason}")
+    _summary_line(  # no-op when step_summary is None
+        step_summary,
+        f"- STUB: `{dataset}` -- {reason}",
+    )
+
+
 def _summary_line(step_summary: Path | None, line: str) -> None:
     if step_summary is None:
         return
@@ -427,10 +450,11 @@ def main(argv: list[str] | None = None) -> int:
     drifted = [e for e in report if e.get("status") == "drifted"]
     probe_failed = [e for e in report if e.get("status") == "probe_failed"]
     clean = [e for e in report if e.get("status") == "clean"]
+    stub = [e for e in report if e.get("status") == "stub"]
 
     print(
         f"summary: {len(drifted)} drifted, {len(probe_failed)} probe_failed, "
-        f"{len(clean)} clean"
+        f"{len(clean)} clean, {len(stub)} stub"
     )
 
     for entry in drifted:
@@ -458,6 +482,9 @@ def main(argv: list[str] | None = None) -> int:
     for entry in probe_failed:
         handle_probe_failed(entry, step_summary=step_summary)
 
+    for entry in stub:
+        handle_stub(entry, step_summary=step_summary)
+
     return 0
 
 

diff --git a/hvantk/algorithms/enrichex/burden.py b/hvantk/algorithms/enrichex/burden.py
@@ -281,7 +281,11 @@ def _build_gene_to_sets_ht(gene_sets: Dict[str, List[str]]) -> "hl.Table":
     """Build a ``gene -> [gene_set_ids]`` table from the gene-set definitions."""
     gene_to_sets: Dict[str, List[str]] = {}
     for gs_name, genes in gene_sets.items():
-        for gene in genes:
+        # dict.fromkeys dedups within a single set while preserving order, so a
+        # gene listed twice in one set is not double-counted after explode_rows
+        # (n_genes_found / burden). set() would lose order; the canonical
+        # parse_geneset_tsv already dedups upstream, so this guards other callers.
+        for gene in dict.fromkeys(genes):
             if gene not in gene_to_sets:
                 gene_to_sets[gene] = []
             gene_to_sets[gene].append(gs_name)
@@ -394,6 +398,16 @@ def compute_geneset_burden_mt(
     """
     _require_hail()
 
+    # Normalize each set's gene list (dedup within set, order-preserving) up
+    # front so the min_gene_set_size filter, gene_set_size, gene_coverage_pct,
+    # and the gene->set membership are all derived from the same deduped genes.
+    # No-op for the canonical pipeline (parse_geneset_tsv already dedups); this
+    # only affects callers passing raw, duplicate-containing lists, where the
+    # deduped membership would otherwise disagree with len()-based sizes.
+    gene_sets = {
+        name: list(dict.fromkeys(genes)) for name, genes in gene_sets.items()
+    }
+
     # Pre-filter gene sets by minimum size
     if min_gene_set_size > 0:
         n_before = len(gene_sets)

diff --git a/hvantk/algorithms/hgc/qc_report.py b/hvantk/algorithms/hgc/qc_report.py
@@ -57,8 +57,13 @@ def render_qc_summary_markdown(summary_data: Dict[str, dict]) -> str:
                     for stat in _SUMMARY_STAT_COLUMNS:
                         value = stats.get(stat, "N/A")
                         if isinstance(value, (int, float)) and stat != "count":
+                            # Use scientific notation for very small magnitudes
+                            # (e.g. tiny HWE p-values) so they don't collapse to
+                            # "0.000"; keep an exact 0 as "0.000".
                             value = (
-                                f"{value:.3f}" if abs(value) < 1000 else f"{value:.2e}"
+                                f"{value:.3f}"
+                                if value == 0 or 1e-3 <= abs(value) < 1000
+                                else f"{value:.2e}"
                             )
                         row += f" {value} |"
                     md_content += row + "\n"

diff --git a/hvantk/core/plugin/api.py b/hvantk/core/plugin/api.py
@@ -21,6 +21,33 @@
 # not flip drift status or invalidate stored artifact fingerprints.
 PROBE_FINGERPRINT_IGNORED_KEYS = frozenset({"fetched_at", "probe_version"})
 
+# Sentinel value for ``probe_status`` marking a drift probe as an intentional
+# stub: a documentation-only / license-gated / publication-only source with no
+# stable, programmatically-probeable direct URL. ``drift_runner`` reports these
+# as status="stub" (a visible WARNING) instead of a silent false-green "clean"
+# or a misleading "probe_failed". See issue #177.
+PROBE_STATUS_STUB = "stub"
+
+# Honest provenance token recorded by ``run_builder._coerce_fingerprint`` for a
+# stubbed source (the ``fingerprint`` key wins there). Self-describing rather
+# than a fake ``sha256:...`` hash, so provenance never implies a real probe ran.
+STUB_FINGERPRINT_TOKEN = "stub:no-programmatic-source"
+
+
+def stub_fingerprint(reason: str) -> dict:
+    """Build the structured sentinel a documentation-only drift probe returns.
+
+    Use this from ``fetch_fingerprint()`` when the source cannot be fingerprinted
+    programmatically (manual/gated/publication-only acquisition). ``reason``
+    explains why (surfaced in the ``hvantk drift`` WARNING). Returning this marks
+    the dataset as status="stub" rather than producing a false-green drift result.
+    """
+    return {
+        "probe_status": PROBE_STATUS_STUB,  # sentinel drift_runner checks for
+        "reason": reason,
+        "fingerprint": STUB_FINGERPRINT_TOKEN,  # self-describing, not a real hash
+    }
+
 
 class Builder(Protocol):
     """Phase B builder contract used by ``DatasetSpec.builder``.

diff --git a/hvantk/core/plugin/drift_runner.py b/hvantk/core/plugin/drift_runner.py
@@ -10,13 +10,18 @@
 from pathlib import Path
 from typing import Any
 
-from .api import DatasetSpec, DriftProbeError, PROBE_FINGERPRINT_IGNORED_KEYS
+from .api import (
+    DatasetSpec,
+    DriftProbeError,
+    PROBE_FINGERPRINT_IGNORED_KEYS,
+    PROBE_STATUS_STUB,
+)
 
 
 @dataclass(frozen=True)
 class DriftResult:
     dataset_name: str
-    status: str  # clean | drifted | probe_failed
+    status: str  # clean | drifted | probe_failed | stub
     observed: dict[str, Any] | None = None
     expected: dict[str, Any] | None = None
     diff: dict[str, Any] | None = None
@@ -32,38 +37,61 @@ def run_drift_check(dataset_name: str, *, timeout: int = 60) -> DriftResult:
     return _run_drift_check_with_spec(spec, timeout=timeout)
 
 
+def _probe_failed(spec: DatasetSpec, exc: DriftProbeError) -> DriftResult:
+    """Build a probe_failed result, also flagging a missing baseline fingerprint.
+
+    The probe runs before the baseline is read (so intentional stubs classify
+    correctly). Without this, a plugin whose probe fails AND ships no committed
+    baseline would surface only the probe error and silently hide the
+    missing-fingerprint configuration issue. The underlying probe error is
+    preserved either way.
+    """
+    fp_path = Path(spec.test_paths.drift_fingerprint)  # baseline fingerprint file
+    if not fp_path.exists():
+        exc = DriftProbeError(  # rebinds exc; original message is embedded below
+            f"{exc}; additionally, expected fingerprint is missing at {fp_path}"
+        )
+    return DriftResult(
+        dataset_name=spec.name, status="probe_failed", probe_error=exc
+    )
+
+
 def _run_drift_check_with_spec(
     spec: DatasetSpec, *, timeout: int = 60
 ) -> DriftResult:
+    # Invoke the probe before loading the baseline so an intentional stub
+    # (documentation-only source with no probeable URL) is reported as
+    # status="stub" — these plugins ship no committed baseline, so a
+    # baseline-first ordering would mislabel them as "probe_failed".
+    try:
+        observed = _invoke_with_timeout(spec.drift_probe, timeout=timeout)
+    except DriftProbeError as exc:
+        return _probe_failed(spec, exc)
+    except Exception as exc:  # noqa: BLE001
+        return _probe_failed(
+            spec, DriftProbeError(f"probe raised {type(exc).__name__}: {exc}")
+        )
+
+    if observed.get("probe_status") == PROBE_STATUS_STUB:
+        return DriftResult(
+            dataset_name=spec.name,
+            status="stub",
+            observed=observed,
+        )
+
     fp_path = Path(spec.test_paths.drift_fingerprint)
     try:
         expected = json.loads(fp_path.read_text())
     except FileNotFoundError:
         return DriftResult(
             dataset_name=spec.name,
             status="probe_failed",
+            observed=observed,
             probe_error=DriftProbeError(
                 f"missing expected fingerprint at {fp_path}"
             ),
         )
 
-    try:
-        observed = _invoke_with_timeout(spec.drift_probe, timeout=timeout)
-    except DriftProbeError as exc:
-        return DriftResult(
-            dataset_name=spec.name,
-            status="probe_failed",
-            expected=expected,
-            probe_error=exc,
-        )
-    except Exception as exc:  # noqa: BLE001
-        return DriftResult(
-            dataset_name=spec.name,
-            status="probe_failed",
-            expected=expected,
-            probe_error=DriftProbeError(f"probe raised {type(exc).__name__}: {exc}"),
-        )
-
     diff = _compare_fingerprints(expected, observed)
     if diff is None:
         return DriftResult(

diff --git a/hvantk/skills/alphagenome/drift_probe.py b/hvantk/skills/alphagenome/drift_probe.py
@@ -1,16 +1,20 @@
-"""Drift probe stub for alphagenome.
+"""Drift probe for alphagenome — documentation-only source (stub).
 
-Phase K plugin promotion lifts this source from the legacy hardcoded
-_TABLE_BUILDERS dict. A real drift probe (which queries the upstream
-source and returns a fingerprint) is a follow-up — for now this stub
-returns a placeholder fingerprint.
+AlphaGenome is consumed as a live prediction API requiring credentials; there is
+no static data file with a stable, programmatically-probeable URL to fingerprint.
+This probe returns a structured stub sentinel so ``hvantk drift`` reports a
+visible WARNING (status="stub") rather than a silent false-green. Replace with a
+real probe if a direct data URL becomes available. See issue #177.
 """
 from __future__ import annotations
 
+from hvantk.core.plugin.api import stub_fingerprint
+
+_REASON = (
+    "AlphaGenome is a live prediction API requiring credentials; "
+    "no static data file to fingerprint"
+)
+
 
 def fetch_fingerprint() -> dict:
-    return {
-        "fingerprint": "sha256:phase-k-stub-not-implemented",
-        "probe_status": "stub",
-        "comment": "Phase K plugin promotion stub; replace with real probe.",
-    }
+    return stub_fingerprint(_REASON)
diff --git a/hvantk/skills/clingen/catalog/datasets.json b/hvantk/skills/clingen/catalog/datasets.json
@@ -4,7 +4,7 @@
     "accession": "ClinGen_GeneDisease",
     "description": "Curated gene-disease associations with evidence-based classifications from ClinGen. Includes classification levels (Definitive, Strong, Moderate, Limited), mode of inheritance, and MONDO disease ontology mappings.",
     "pubmedid": "28552198",
-    "data_source": "Custom",
+    "data_source": "ClinGen",
     "last_updated": "2026-03-06T00:00:00.000000",
     "update_frequency": "monthly",
     "organism": "Homo sapiens",

diff --git a/hvantk/skills/clinvar/catalog/datasets.json b/hvantk/skills/clinvar/catalog/datasets.json
@@ -4,7 +4,7 @@
     "accession": "ClinVar_latest",
     "description": "Public archive of reports of the relationships among human variations and phenotypes, with supporting evidence. ClinVar facilitates access to and communication about the relationships asserted between human variation and observed health status.",
     "pubmedid": "24234437",
-    "data_source": "Custom",
+    "data_source": "ClinVar",
     "last_updated": "2025-09-23T16:30:00.000000",
     "update_frequency": "monthly",
     "organism": "Homo sapiens",

diff --git a/hvantk/skills/cosmic_cgc/drift_probe.py b/hvantk/skills/cosmic_cgc/drift_probe.py
@@ -1,16 +1,20 @@
-"""Drift probe stub for cosmic-cgc.
+"""Drift probe for cosmic-cgc — documentation-only source (stub).
 
-Phase K plugin promotion lifts this source from the legacy hardcoded
-_TABLE_BUILDERS dict. A real drift probe (which queries the upstream
-source and returns a fingerprint) is a follow-up — for now this stub
-returns a placeholder fingerprint.
+The COSMIC Cancer Gene Census is behind a login/license gate
+(cancer.sanger.ac.uk/census); there is no public direct URL to fingerprint.
+This probe returns a structured stub sentinel so ``hvantk drift`` reports a
+visible WARNING (status="stub") rather than a silent false-green. Replace with a
+real probe if a direct data URL becomes available. See issue #177.
 """
 from __future__ import annotations
 
+from hvantk.core.plugin.api import stub_fingerprint
+
+_REASON = (
+    "COSMIC Cancer Gene Census is login/license-gated "
+    "(cancer.sanger.ac.uk/census); no public direct URL to fingerprint"
+)
+
 
 def fetch_fingerprint() -> dict:
-    return {
-        "fingerprint": "sha256:phase-k-stub-not-implemented",
-        "probe_status": "stub",
-        "comment": "Phase K plugin promotion stub; replace with real probe.",
-    }
+    return stub_fingerprint(_REASON)
diff --git a/hvantk/skills/dbnsfp/catalog/datasets.json b/hvantk/skills/dbnsfp/catalog/datasets.json
@@ -4,7 +4,7 @@
     "accession": "dbNSFP_v4.7",
     "description": "Comprehensive database of functional predictions and annotations for human nonsynonymous and splice-site SNVs. Includes prediction scores from SIFT, PolyPhen2, CADD, GERP++, PhyloP, PhastCons, and many others for variant pathogenicity assessment.",
     "pubmedid": "21520341",
-    "data_source": "Custom",
+    "data_source": "dbNSFP",
     "last_updated": "2025-09-23T16:30:00.000000",
     "update_frequency": "quarterly",
     "organism": "Homo sapiens",

diff --git a/hvantk/skills/dbnsfp/drift_probe.py b/hvantk/skills/dbnsfp/drift_probe.py
@@ -1,16 +1,20 @@
-"""Drift probe stub for dbnsfp.
+"""Drift probe for dbnsfp — documentation-only source (stub).
 
-Phase K plugin promotion lifts this source from the legacy hardcoded
-_TABLE_BUILDERS dict. A real drift probe (which queries the upstream
-source and returns a fingerprint) is a follow-up — for now this stub
-returns a placeholder fingerprint.
+dbNSFP is distributed from a landing page (sites.google.com/site/jpopgen/dbNSFP)
+with no stable direct data URL to fingerprint. This probe returns a structured
+stub sentinel so ``hvantk drift`` reports a visible WARNING (status="stub")
+rather than a silent false-green. Replace with a real probe if a direct data URL
+becomes available. See issue #177.
 """
 from __future__ import annotations
 
+from hvantk.core.plugin.api import stub_fingerprint
+
+_REASON = (
+    "dbNSFP is distributed from a landing page "
+    "(sites.google.com/site/jpopgen/dbNSFP); no stable direct data URL"
+)
+
 
 def fetch_fingerprint() -> dict:
-    return {
-        "fingerprint": "sha256:phase-k-stub-not-implemented",
-        "probe_status": "stub",
-        "comment": "Phase K plugin promotion stub; replace with real probe.",
-    }
+    return stub_fingerprint(_REASON)
diff --git a/hvantk/skills/ensembl_gene/catalog/datasets.json b/hvantk/skills/ensembl_gene/catalog/datasets.json
@@ -4,7 +4,7 @@
     "accession": "Ensembl_v110",
     "description": "Comprehensive gene annotations from the Ensembl project including gene models, transcript isoforms, protein sequences, regulatory features, and comparative genomics data across species.",
     "pubmedid": "31691826",
-    "data_source": "Custom",
+    "data_source": "Ensembl",
     "last_updated": "2025-09-23T16:30:00.000000",
     "update_frequency": "quarterly",
     "organism": "Homo sapiens",
@@ -35,7 +35,7 @@
       },
       {
         "path": "Homo_sapiens.GRCh38.110.gff3.gz",
-        "format": "gff",
+        "format": "gff3",
         "size_bytes": 900000000,
         "compression": "gzip",
         "description": "Complete gene annotations in GFF3 format"