Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 30 additions & 3 deletions .github/scripts/drift_to_pr.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
[
{
"dataset_name": "clinvar:variants",
"status": "clean" | "drifted" | "probe_failed" | "stub",
"observed": {...} | null,
"expected": {...} | null,
"diff": {...} | null,
Expand All @@ -32,7 +32,10 @@
6. Opens a draft PR (or updates the body of an existing one).

Probe-failed entries are recorded in the GitHub step summary but never
produce a PR (they are infrastructure failures, not data drift). ``stub``
entries (documentation-only sources with no programmatic probe) are likewise
never turned into PRs; they are listed in the step summary and included in the
printed status counts so they are not silently dropped.

Exit code is always 0 unless a wholly unexpected error escapes; drift,
probe-failed, and stub entries are not workflow failures.
Expand Down Expand Up @@ -366,6 +369,26 @@ def handle_probe_failed(
)


def handle_stub(
    entry: dict,
    *,
    step_summary: Path | None,
) -> None:
    """Report a documentation-only stub entry on stdout and in the step summary.

    The entry is only made visible: this function prints two lines and hands
    one bullet line to ``_summary_line``. It opens no PR and returns ``None``.

    Args:
        entry: One drift-report record. ``dataset_name`` is required; the
            human-readable explanation is read from ``observed["reason"]``
            when ``observed`` is a non-empty mapping carrying that key.
        step_summary: Destination passed through to ``_summary_line``; may be
            ``None``.

    Raises:
        KeyError: If ``entry`` has no ``dataset_name`` key.
    """
    name = entry["dataset_name"]
    # ``observed`` may be absent or null in the report; treat both as empty.
    probe_payload = entry.get("observed") or {}
    why = probe_payload.get(
        "reason", "documentation-only source; no programmatic probe"
    )

    heading = f"\n=== Stub (no real drift detection): {name} ==="
    detail = f" {why}"
    print(heading)
    print(detail)

    bullet = f"- STUB: `{name}` -- {why}"
    _summary_line(step_summary, bullet)


def _summary_line(step_summary: Path | None, line: str) -> None:
if step_summary is None:
return
Expand Down Expand Up @@ -427,10 +450,11 @@ def main(argv: list[str] | None = None) -> int:
drifted = [e for e in report if e.get("status") == "drifted"]
probe_failed = [e for e in report if e.get("status") == "probe_failed"]
clean = [e for e in report if e.get("status") == "clean"]
stub = [e for e in report if e.get("status") == "stub"]

print(
f"summary: {len(drifted)} drifted, {len(probe_failed)} probe_failed, "
f"{len(clean)} clean"
f"{len(clean)} clean, {len(stub)} stub"
)

for entry in drifted:
Expand Down Expand Up @@ -458,6 +482,9 @@ def main(argv: list[str] | None = None) -> int:
for entry in probe_failed:
handle_probe_failed(entry, step_summary=step_summary)

for entry in stub:
handle_stub(entry, step_summary=step_summary)

return 0


Expand Down
16 changes: 15 additions & 1 deletion hvantk/algorithms/enrichex/burden.py
Original file line number Diff line number Diff line change
Expand Up @@ -281,7 +281,11 @@ def _build_gene_to_sets_ht(gene_sets: Dict[str, List[str]]) -> "hl.Table":
"""Build a ``gene -> [gene_set_ids]`` table from the gene-set definitions."""
gene_to_sets: Dict[str, List[str]] = {}
for gs_name, genes in gene_sets.items():
for gene in genes:
# dict.fromkeys dedups within a single set while preserving order, so a
# gene listed twice in one set is not double-counted after explode_rows
# (n_genes_found / burden). set() would lose order; the canonical
# parse_geneset_tsv already dedups upstream, so this guards other callers.
for gene in dict.fromkeys(genes):
if gene not in gene_to_sets:
gene_to_sets[gene] = []
gene_to_sets[gene].append(gs_name)
Expand Down Expand Up @@ -394,6 +398,16 @@ def compute_geneset_burden_mt(
"""
_require_hail()

# Normalize each set's gene list (dedup within set, order-preserving) up
# front so the min_gene_set_size filter, gene_set_size, gene_coverage_pct,
# and the gene->set membership are all derived from the same deduped genes.
# No-op for the canonical pipeline (parse_geneset_tsv already dedups); this
# only affects callers passing raw, duplicate-containing lists, where the
# deduped membership would otherwise disagree with len()-based sizes.
gene_sets = {
name: list(dict.fromkeys(genes)) for name, genes in gene_sets.items()
}

# Pre-filter gene sets by minimum size
if min_gene_set_size > 0:
n_before = len(gene_sets)
Expand Down
7 changes: 6 additions & 1 deletion hvantk/algorithms/hgc/qc_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,13 @@ def render_qc_summary_markdown(summary_data: Dict[str, dict]) -> str:
for stat in _SUMMARY_STAT_COLUMNS:
value = stats.get(stat, "N/A")
if isinstance(value, (int, float)) and stat != "count":
# Use scientific notation for very small magnitudes
# (e.g. tiny HWE p-values) so they don't collapse to
# "0.000"; keep an exact 0 as "0.000".
value = (
f"{value:.3f}" if abs(value) < 1000 else f"{value:.2e}"
f"{value:.3f}"
if value == 0 or 1e-3 <= abs(value) < 1000
else f"{value:.2e}"
)
row += f" {value} |"
md_content += row + "\n"
Expand Down
27 changes: 27 additions & 0 deletions hvantk/core/plugin/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,33 @@
# not flip drift status or invalidate stored artifact fingerprints.
PROBE_FINGERPRINT_IGNORED_KEYS = frozenset({"fetched_at", "probe_version"})

# ``probe_status`` value that flags a drift probe as a deliberate stub, i.e. a
# source that is documentation-only, license-gated, or publication-only and so
# has no stable direct URL that can be probed programmatically.
# ``drift_runner`` turns this into status="stub" (a visible WARNING) rather
# than a silent false-green "clean" or a misleading "probe_failed".
# See issue #177.
PROBE_STATUS_STUB = "stub"

# Provenance token carried under the ``fingerprint`` key of a stubbed source.
# It is deliberately self-describing instead of imitating a ``sha256:...``
# digest, so recorded provenance never suggests that a real probe ran
# (``run_builder._coerce_fingerprint`` is expected to record it, since the
# ``fingerprint`` key wins there).
STUB_FINGERPRINT_TOKEN = "stub:no-programmatic-source"


def stub_fingerprint(reason: str) -> dict:
    """Return the structured sentinel for a documentation-only drift probe.

    Intended as the return value of a plugin's ``fetch_fingerprint()`` when the
    upstream source cannot be fingerprinted programmatically (manual, gated, or
    publication-only acquisition).

    Args:
        reason: Human-readable explanation of why no real probe exists; it is
            stored verbatim under the ``reason`` key.

    Returns:
        A new dict with the keys ``probe_status`` (``PROBE_STATUS_STUB``),
        ``reason`` and ``fingerprint`` (``STUB_FINGERPRINT_TOKEN``), in that
        order.
    """
    return dict(
        probe_status=PROBE_STATUS_STUB,
        reason=reason,
        fingerprint=STUB_FINGERPRINT_TOKEN,
    )


class Builder(Protocol):
"""Phase B builder contract used by ``DatasetSpec.builder``.
Expand Down
66 changes: 47 additions & 19 deletions hvantk/core/plugin/drift_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,18 @@
from pathlib import Path
from typing import Any

from .api import DatasetSpec, DriftProbeError, PROBE_FINGERPRINT_IGNORED_KEYS
from .api import (
DatasetSpec,
DriftProbeError,
PROBE_FINGERPRINT_IGNORED_KEYS,
PROBE_STATUS_STUB,
)


@dataclass(frozen=True)
class DriftResult:
dataset_name: str
status: str # clean | drifted | probe_failed
status: str # clean | drifted | probe_failed | stub
observed: dict[str, Any] | None = None
expected: dict[str, Any] | None = None
diff: dict[str, Any] | None = None
Expand All @@ -32,38 +37,61 @@ def run_drift_check(dataset_name: str, *, timeout: int = 60) -> DriftResult:
return _run_drift_check_with_spec(spec, timeout=timeout)


def _probe_failed(spec: DatasetSpec, exc: DriftProbeError) -> DriftResult:
    """Wrap a probe error in a ``probe_failed`` result, noting a missing baseline.

    If the committed baseline fingerprint at
    ``spec.test_paths.drift_fingerprint`` does not exist, the reported error is
    replaced by a new ``DriftProbeError`` whose message is the original error
    text followed by a note about the missing file. Otherwise the original
    error is reported unchanged. This keeps a missing-baseline configuration
    problem from being hidden behind the probe failure.

    Args:
        spec: Dataset spec supplying ``name`` and the baseline path.
        exc: The error describing why the probe failed.

    Returns:
        A ``DriftResult`` with ``status="probe_failed"`` and ``probe_error``
        set; ``observed``, ``expected`` and ``diff`` are left at their defaults.
    """
    fp_path = Path(spec.test_paths.drift_fingerprint)
    if fp_path.exists():
        reported = exc
    else:
        # Keep the underlying probe error text and append the baseline problem.
        reported = DriftProbeError(
            f"{exc}; additionally, expected fingerprint is missing at {fp_path}"
        )
    return DriftResult(
        dataset_name=spec.name,
        status="probe_failed",
        probe_error=reported,
    )


def _run_drift_check_with_spec(
spec: DatasetSpec, *, timeout: int = 60
) -> DriftResult:
# Invoke the probe before loading the baseline so an intentional stub
# (documentation-only source with no probeable URL) is reported as
# status="stub" — these plugins ship no committed baseline, so a
# baseline-first ordering would mislabel them as "probe_failed".
try:
observed = _invoke_with_timeout(spec.drift_probe, timeout=timeout)
except DriftProbeError as exc:
return _probe_failed(spec, exc)
except Exception as exc: # noqa: BLE001
return _probe_failed(
spec, DriftProbeError(f"probe raised {type(exc).__name__}: {exc}")
)

if observed.get("probe_status") == PROBE_STATUS_STUB:
return DriftResult(
dataset_name=spec.name,
status="stub",
observed=observed,
)

fp_path = Path(spec.test_paths.drift_fingerprint)
try:
expected = json.loads(fp_path.read_text())
except FileNotFoundError:
return DriftResult(
dataset_name=spec.name,
status="probe_failed",
observed=observed,
probe_error=DriftProbeError(
f"missing expected fingerprint at {fp_path}"
),
)

try:
observed = _invoke_with_timeout(spec.drift_probe, timeout=timeout)
except DriftProbeError as exc:
return DriftResult(
dataset_name=spec.name,
status="probe_failed",
expected=expected,
probe_error=exc,
)
except Exception as exc: # noqa: BLE001
return DriftResult(
dataset_name=spec.name,
status="probe_failed",
expected=expected,
probe_error=DriftProbeError(f"probe raised {type(exc).__name__}: {exc}"),
)

diff = _compare_fingerprints(expected, observed)
if diff is None:
return DriftResult(
Expand Down
24 changes: 14 additions & 10 deletions hvantk/skills/alphagenome/drift_probe.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,20 @@
"""Drift probe stub for alphagenome.
"""Drift probe for alphagenome — documentation-only source (stub).

Phase K plugin promotion lifts this source from the legacy hardcoded
_TABLE_BUILDERS dict. A real drift probe (which queries the upstream
source and returns a fingerprint) is a follow-up — for now this stub
returns a placeholder fingerprint.
AlphaGenome is consumed as a live prediction API requiring credentials; there is
no static data file with a stable, programmatically-probeable URL to fingerprint.
This probe returns a structured stub sentinel so ``hvantk drift`` reports a
visible WARNING (status="stub") rather than a silent false-green. Replace with a
real probe if a direct data URL becomes available. See issue #177.
"""
from __future__ import annotations

from hvantk.core.plugin.api import stub_fingerprint

_REASON = (
"AlphaGenome is a live prediction API requiring credentials; "
"no static data file to fingerprint"
)


def fetch_fingerprint() -> dict:
    """Return the stub sentinel for AlphaGenome instead of a real fingerprint.

    The superseded placeholder (a fake ``sha256:...`` fingerprint) is no longer
    returned; the result comes from ``stub_fingerprint`` with this module's
    ``_REASON`` so the dataset is reported as a stub rather than as clean.

    Returns:
        The dict produced by ``stub_fingerprint(_REASON)``.
    """
    return stub_fingerprint(_REASON)
2 changes: 1 addition & 1 deletion hvantk/skills/clingen/catalog/datasets.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
"accession": "ClinGen_GeneDisease",
"description": "Curated gene-disease associations with evidence-based classifications from ClinGen. Includes classification levels (Definitive, Strong, Moderate, Limited), mode of inheritance, and MONDO disease ontology mappings.",
"pubmedid": "28552198",
"data_source": "Custom",
"data_source": "ClinGen",
"last_updated": "2026-03-06T00:00:00.000000",
"update_frequency": "monthly",
"organism": "Homo sapiens",
Expand Down
2 changes: 1 addition & 1 deletion hvantk/skills/clinvar/catalog/datasets.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
"accession": "ClinVar_latest",
"description": "Public archive of reports of the relationships among human variations and phenotypes, with supporting evidence. ClinVar facilitates access to and communication about the relationships asserted between human variation and observed health status.",
"pubmedid": "24234437",
"data_source": "Custom",
"data_source": "ClinVar",
"last_updated": "2025-09-23T16:30:00.000000",
"update_frequency": "monthly",
"organism": "Homo sapiens",
Expand Down
24 changes: 14 additions & 10 deletions hvantk/skills/cosmic_cgc/drift_probe.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,20 @@
"""Drift probe stub for cosmic-cgc.
"""Drift probe for cosmic-cgc — documentation-only source (stub).

Phase K plugin promotion lifts this source from the legacy hardcoded
_TABLE_BUILDERS dict. A real drift probe (which queries the upstream
source and returns a fingerprint) is a follow-up — for now this stub
returns a placeholder fingerprint.
The COSMIC Cancer Gene Census is behind a login/license gate
(cancer.sanger.ac.uk/census); there is no public direct URL to fingerprint.
This probe returns a structured stub sentinel so ``hvantk drift`` reports a
visible WARNING (status="stub") rather than a silent false-green. Replace with a
real probe if a direct data URL becomes available. See issue #177.
"""
from __future__ import annotations

from hvantk.core.plugin.api import stub_fingerprint

_REASON = (
"COSMIC Cancer Gene Census is login/license-gated "
"(cancer.sanger.ac.uk/census); no public direct URL to fingerprint"
)


def fetch_fingerprint() -> dict:
    """Return the stub sentinel for COSMIC CGC instead of a real fingerprint.

    The superseded placeholder (a fake ``sha256:...`` fingerprint) is no longer
    returned; the result comes from ``stub_fingerprint`` with this module's
    ``_REASON`` so the dataset is reported as a stub rather than as clean.

    Returns:
        The dict produced by ``stub_fingerprint(_REASON)``.
    """
    return stub_fingerprint(_REASON)
2 changes: 1 addition & 1 deletion hvantk/skills/dbnsfp/catalog/datasets.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
"accession": "dbNSFP_v4.7",
"description": "Comprehensive database of functional predictions and annotations for human nonsynonymous and splice-site SNVs. Includes prediction scores from SIFT, PolyPhen2, CADD, GERP++, PhyloP, PhastCons, and many others for variant pathogenicity assessment.",
"pubmedid": "21520341",
"data_source": "Custom",
"data_source": "dbNSFP",
"last_updated": "2025-09-23T16:30:00.000000",
"update_frequency": "quarterly",
"organism": "Homo sapiens",
Expand Down
24 changes: 14 additions & 10 deletions hvantk/skills/dbnsfp/drift_probe.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,20 @@
"""Drift probe stub for dbnsfp.
"""Drift probe for dbnsfp — documentation-only source (stub).

Phase K plugin promotion lifts this source from the legacy hardcoded
_TABLE_BUILDERS dict. A real drift probe (which queries the upstream
source and returns a fingerprint) is a follow-up — for now this stub
returns a placeholder fingerprint.
dbNSFP is distributed from a landing page (sites.google.com/site/jpopgen/dbNSFP)
with no stable direct data URL to fingerprint. This probe returns a structured
stub sentinel so ``hvantk drift`` reports a visible WARNING (status="stub")
rather than a silent false-green. Replace with a real probe if a direct data URL
becomes available. See issue #177.
"""
from __future__ import annotations

from hvantk.core.plugin.api import stub_fingerprint

_REASON = (
"dbNSFP is distributed from a landing page "
"(sites.google.com/site/jpopgen/dbNSFP); no stable direct data URL"
)


def fetch_fingerprint() -> dict:
    """Return the stub sentinel for dbNSFP instead of a real fingerprint.

    The superseded placeholder (a fake ``sha256:...`` fingerprint) is no longer
    returned; the result comes from ``stub_fingerprint`` with this module's
    ``_REASON`` so the dataset is reported as a stub rather than as clean.

    Returns:
        The dict produced by ``stub_fingerprint(_REASON)``.
    """
    return stub_fingerprint(_REASON)
4 changes: 2 additions & 2 deletions hvantk/skills/ensembl_gene/catalog/datasets.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
"accession": "Ensembl_v110",
"description": "Comprehensive gene annotations from the Ensembl project including gene models, transcript isoforms, protein sequences, regulatory features, and comparative genomics data across species.",
"pubmedid": "31691826",
"data_source": "Custom",
"data_source": "Ensembl",
"last_updated": "2025-09-23T16:30:00.000000",
"update_frequency": "quarterly",
"organism": "Homo sapiens",
Expand Down Expand Up @@ -35,7 +35,7 @@
},
{
"path": "Homo_sapiens.GRCh38.110.gff3.gz",
"format": "gff",
"format": "gff3",
"size_bytes": 900000000,
"compression": "gzip",
"description": "Complete gene annotations in GFF3 format"
Expand Down
Loading
Loading