|
| 1 | +#!/usr/bin/env python3 |
| 2 | +"""Validate outputs from a completed single-cell pipeline run.""" |
| 3 | + |
| 4 | +from __future__ import annotations |
| 5 | + |
| 6 | +import hashlib |
| 7 | +from pathlib import Path |
| 8 | + |
| 9 | +import pandas as pd |
| 10 | +import scanpy as sc |
| 11 | + |
| 12 | +RESULTS_DIR = Path("results") |
| 13 | + |
| 14 | +REQUIRED_FILES = [ |
| 15 | + "01_filtered.h5ad", |
| 16 | + "02_preprocessed.h5ad", |
| 17 | + "03_reduced.h5ad", |
| 18 | + "04_clustered.h5ad", |
| 19 | + "05_annotated.h5ad", |
| 20 | + "06_trajectory.h5ad", |
| 21 | + "07_t_cells.h5ad", |
| 22 | + "marker_genes.csv", |
| 23 | + "cell_type_composition.csv", |
| 24 | + "figures/01_qc_metrics.png", |
| 25 | + "figures/02_hvg_selection.png", |
| 26 | + "figures/03_pca_variance.png", |
| 27 | + "figures/04_clustering.png", |
| 28 | + "figures/05_marker_dotplot.png", |
| 29 | + "figures/05_cell_types_umap.png", |
| 30 | + "figures/06_paga_graph.png", |
| 31 | + "figures/06_paga_graph.pdf", |
| 32 | + "figures/06_trajectory.png", |
| 33 | + "figures/06_trajectory.pdf", |
| 34 | + "figures/07_t_cell_subclustering.png", |
| 35 | + "figures/07_t_cell_subclustering.pdf", |
| 36 | + "figures/07_t_cell_markers.png", |
| 37 | + "figures/08_publication_figure.png", |
| 38 | + "figures/08_publication_figure.pdf", |
| 39 | + "output_manifest.csv", |
| 40 | +] |
| 41 | + |
| 42 | + |
| 43 | +def require_file(path: Path) -> None: |
| 44 | + if not path.exists(): |
| 45 | + raise SystemExit(f"Missing expected output: {path}") |
| 46 | + if path.stat().st_size == 0: |
| 47 | + raise SystemExit(f"Output is empty: {path}") |
| 48 | + |
| 49 | + |
| 50 | +def sha256_file(path: Path) -> str: |
| 51 | + digest = hashlib.sha256() |
| 52 | + with path.open("rb") as handle: |
| 53 | + for chunk in iter(lambda: handle.read(1024 * 1024), b""): |
| 54 | + digest.update(chunk) |
| 55 | + return digest.hexdigest() |
| 56 | + |
| 57 | + |
| 58 | +def validate_manifest() -> None: |
| 59 | + manifest_path = RESULTS_DIR / "output_manifest.csv" |
| 60 | + manifest = pd.read_csv(manifest_path) |
| 61 | + required_cols = {"path", "bytes", "sha256"} |
| 62 | + if not required_cols.issubset(manifest.columns): |
| 63 | + raise SystemExit(f"Manifest missing columns: {required_cols - set(manifest.columns)}") |
| 64 | + if len(manifest) < 20: |
| 65 | + raise SystemExit("Manifest contains too few artefacts") |
| 66 | + |
| 67 | + for row in manifest.itertuples(index=False): |
| 68 | + path = Path(row.path) |
| 69 | + require_file(path) |
| 70 | + if int(row.bytes) != path.stat().st_size: |
| 71 | + raise SystemExit(f"Manifest size mismatch for {path}") |
| 72 | + if row.sha256 != sha256_file(path): |
| 73 | + raise SystemExit(f"Manifest checksum mismatch for {path}") |
| 74 | + |
| 75 | + |
| 76 | +def validate_h5ad_outputs() -> None: |
| 77 | + adata = sc.read_h5ad(RESULTS_DIR / "06_trajectory.h5ad") |
| 78 | + if not 2500 <= adata.n_obs <= 2700: |
| 79 | + raise SystemExit(f"Unexpected retained cell count: {adata.n_obs}") |
| 80 | + for col in ["leiden", "cell_type", "dpt_pseudotime"]: |
| 81 | + if col not in adata.obs: |
| 82 | + raise SystemExit(f"Missing obs column in trajectory object: {col}") |
| 83 | + if adata.obs["cell_type"].nunique() < 5: |
| 84 | + raise SystemExit("Expected at least five annotated cell types") |
| 85 | + |
| 86 | + t_cells = sc.read_h5ad(RESULTS_DIR / "07_t_cells.h5ad") |
| 87 | + if t_cells.n_obs < 500: |
| 88 | + raise SystemExit(f"Unexpected T cell subset size: {t_cells.n_obs}") |
| 89 | + if "t_subtype" not in t_cells.obs: |
| 90 | + raise SystemExit("Missing T cell subtype annotations") |
| 91 | + |
| 92 | + |
| 93 | +def validate_tables() -> None: |
| 94 | + composition = pd.read_csv(RESULTS_DIR / "cell_type_composition.csv", index_col=0) |
| 95 | + if composition.empty: |
| 96 | + raise SystemExit("Cell type composition table is empty") |
| 97 | + total = int(composition.iloc[:, 0].sum()) |
| 98 | + |
| 99 | + adata = sc.read_h5ad(RESULTS_DIR / "05_annotated.h5ad") |
| 100 | + if total != adata.n_obs: |
| 101 | + raise SystemExit(f"Composition total {total} != annotated cells {adata.n_obs}") |
| 102 | + |
| 103 | + markers = pd.read_csv(RESULTS_DIR / "marker_genes.csv") |
| 104 | + required_marker_cols = {"group", "names", "scores", "pvals_adj", "logfoldchanges"} |
| 105 | + if not required_marker_cols.issubset(markers.columns): |
| 106 | + raise SystemExit("Marker table is missing expected rank_genes_groups columns") |
| 107 | + if len(markers) < 100: |
| 108 | + raise SystemExit("Marker table contains too few marker rows") |
| 109 | + |
| 110 | + |
| 111 | +def main() -> None: |
| 112 | + for relpath in REQUIRED_FILES: |
| 113 | + require_file(RESULTS_DIR / relpath) |
| 114 | + validate_manifest() |
| 115 | + validate_h5ad_outputs() |
| 116 | + validate_tables() |
| 117 | + print("Validated single-cell pipeline outputs") |
| 118 | + |
| 119 | + |
| 120 | +if __name__ == "__main__": |
| 121 | + main() |
0 commit comments