Add full pipeline smoke validation

Ekin-Kahraman · Ekin-Kahraman · commit 9ab34939cf63 · 2026-05-15T16:30:11.000+01:00
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -33,3 +33,25 @@ jobs:
         run: |
           pip install ruff
           ruff check scripts/ tests/
+
+  pipeline-smoke:
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python 3.12
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -e ".[dev]"
+
+      - name: Run full pipeline
+        run: python run_pipeline.py
+
+      - name: Validate generated outputs
+        run: python scripts/validate_outputs.py
diff --git a/README.md b/README.md
@@ -8,6 +8,13 @@ Single-cell RNA-seq analysis pipeline in Python using [scanpy](https://scanpy.re
 
 Retains 2,604 PBMCs, resolves 5 major immune populations plus CD4/CD8 T-cell subclusters, with CI-tested reproducibility.
 
+## Production Readiness
+
+- CI runs unit tests across Python 3.10, 3.11, and 3.12.
+- CI also runs the complete PBMC pipeline on Python 3.12 and validates generated `.h5ad`, CSV, PNG, PDF, and manifest artefacts.
+- `results/output_manifest.csv` is generated on each full run with file sizes and SHA-256 checksums for pipeline outputs.
+- `scripts/validate_outputs.py` checks retained cell counts, required annotations, T-cell subtyping, marker tables, figures, and manifest integrity.
+
 <p align="center">
   <img src="docs/umap_3d_rotation.gif" alt="3D UMAP rotation showing PBMC immune cell clusters" width="600">
 </p>
@@ -55,6 +62,9 @@ PBMC 3k (10X Genomics, 2,700 cells)
     │
     ▼
  08 Figures ─────── Multi-panel publication figure (PNG 300 DPI + PDF vector)
+    │
+    ▼
+ 09 Manifest ─────── SHA-256 manifest for generated analysis artefacts
 ```
 
 ## Pipeline
@@ -69,6 +79,7 @@ PBMC 3k (10X Genomics, 2,700 cells)
 | 06 | `06_trajectory.py` | PAGA partition-based graph abstraction, PAGA-initialised UMAP, diffusion pseudotime |
 | 07 | `07_t_cell_subclustering.py` | Extract T cell compartment, subcluster, resolve CD4+/CD8+ via marker scoring |
 | 08 | `08_publication_figures.py` | Multi-panel figure with UMAP, composition, marker heatmap, summary (PNG + PDF) |
+| 09 | `09_output_manifest.py` | Generate checksums and file-size manifest for analysis artefacts |
 
 ## Project Structure
 
@@ -83,6 +94,8 @@ single-cell-rnaseq-immune-profiling/
 │   ├── 06_trajectory.py            PAGA + diffusion pseudotime
 │   ├── 07_t_cell_subclustering.py  CD4+/CD8+ resolution
 │   ├── 08_publication_figures.py   Multi-panel figure (PNG + PDF)
+│   ├── 09_output_manifest.py       Output checksums and file sizes
+│   ├── validate_outputs.py         Full-run output validator
 │   └── palette.py                  Shared Okabe-Ito colourblind palette
 ├── tests/
 │   └── test_pipeline.py            7 tests (QC, normalisation, clustering, markers)
@@ -140,6 +153,7 @@ cd single-cell-rnaseq-immune-profiling
 pip install -e .                  # or: pip install -r requirements-lock.txt
 python run_pipeline.py            # full pipeline (~38s)
 python run_pipeline.py --from 6   # resume from trajectory step
+python scripts/validate_outputs.py  # validate outputs of a completed full run
 ```
 
 For exact reproducibility, use `requirements-lock.txt` which pins all dependency versions.
@@ -151,7 +165,7 @@ pip install -e ".[dev]"
 pytest -v
 ```
 
-7 tests covering QC filtering, normalisation, HVG selection, clustering, and marker gene validation. CI runs on Python 3.10, 3.11, and 3.12.
+7 tests covering QC filtering, normalisation, HVG selection, clustering, and marker gene validation. CI runs tests on Python 3.10, 3.11, and 3.12, then runs and validates the full pipeline on Python 3.12.
 
 ## Design Decisions
 
diff --git a/run_pipeline.py b/run_pipeline.py
@@ -23,6 +23,7 @@
     ("06 Trajectory inference", "06_trajectory.py"),
     ("07 T cell subclustering", "07_t_cell_subclustering.py"),
     ("08 Publication figures", "08_publication_figures.py"),
+    ("09 Output manifest", "09_output_manifest.py"),
 ]
 
 N_STEPS = len(STEPS)
diff --git a/scripts/09_output_manifest.py b/scripts/09_output_manifest.py
@@ -0,0 +1,55 @@
+#!/usr/bin/env python3
+"""Write a checksum manifest for generated pipeline artefacts."""
+
+from __future__ import annotations
+
+import csv
+import hashlib
+from pathlib import Path
+
+RESULTS_DIR = Path("results")
+MANIFEST_PATH = RESULTS_DIR / "output_manifest.csv"
+INCLUDED_SUFFIXES = {".h5ad", ".csv", ".png", ".pdf"}
+
+
+def sha256_file(path: Path) -> str:
+    digest = hashlib.sha256()
+    with path.open("rb") as handle:
+        for chunk in iter(lambda: handle.read(1024 * 1024), b""):
+            digest.update(chunk)
+    return digest.hexdigest()
+
+
+def iter_artefacts():
+    for path in sorted(RESULTS_DIR.rglob("*")):
+        if not path.is_file():
+            continue
+        if path == MANIFEST_PATH:
+            continue
+        if path.suffix.lower() in INCLUDED_SUFFIXES:
+            yield path
+
+
+def main():
+    RESULTS_DIR.mkdir(exist_ok=True)
+    artefacts = list(iter_artefacts())
+    if not artefacts:
+        raise SystemExit("No generated artefacts found under results/")
+
+    with MANIFEST_PATH.open("w", newline="") as handle:
+        writer = csv.DictWriter(handle, fieldnames=["path", "bytes", "sha256"])
+        writer.writeheader()
+        for path in artefacts:
+            writer.writerow(
+                {
+                    "path": path.as_posix(),
+                    "bytes": path.stat().st_size,
+                    "sha256": sha256_file(path),
+                }
+            )
+
+    print(f"Wrote {MANIFEST_PATH} for {len(artefacts)} artefacts")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/validate_outputs.py b/scripts/validate_outputs.py
@@ -0,0 +1,121 @@
+#!/usr/bin/env python3
+"""Validate outputs from a completed single-cell pipeline run."""
+
+from __future__ import annotations
+
+import hashlib
+from pathlib import Path
+
+import pandas as pd
+import scanpy as sc
+
+RESULTS_DIR = Path("results")
+
+REQUIRED_FILES = [
+    "01_filtered.h5ad",
+    "02_preprocessed.h5ad",
+    "03_reduced.h5ad",
+    "04_clustered.h5ad",
+    "05_annotated.h5ad",
+    "06_trajectory.h5ad",
+    "07_t_cells.h5ad",
+    "marker_genes.csv",
+    "cell_type_composition.csv",
+    "figures/01_qc_metrics.png",
+    "figures/02_hvg_selection.png",
+    "figures/03_pca_variance.png",
+    "figures/04_clustering.png",
+    "figures/05_marker_dotplot.png",
+    "figures/05_cell_types_umap.png",
+    "figures/06_paga_graph.png",
+    "figures/06_paga_graph.pdf",
+    "figures/06_trajectory.png",
+    "figures/06_trajectory.pdf",
+    "figures/07_t_cell_subclustering.png",
+    "figures/07_t_cell_subclustering.pdf",
+    "figures/07_t_cell_markers.png",
+    "figures/08_publication_figure.png",
+    "figures/08_publication_figure.pdf",
+    "output_manifest.csv",
+]
+
+
+def require_file(path: Path) -> None:
+    if not path.exists():
+        raise SystemExit(f"Missing expected output: {path}")
+    if path.stat().st_size == 0:
+        raise SystemExit(f"Output is empty: {path}")
+
+
+def sha256_file(path: Path) -> str:
+    digest = hashlib.sha256()
+    with path.open("rb") as handle:
+        for chunk in iter(lambda: handle.read(1024 * 1024), b""):
+            digest.update(chunk)
+    return digest.hexdigest()
+
+
+def validate_manifest() -> None:
+    manifest_path = RESULTS_DIR / "output_manifest.csv"
+    manifest = pd.read_csv(manifest_path)
+    required_cols = {"path", "bytes", "sha256"}
+    if not required_cols.issubset(manifest.columns):
+        raise SystemExit(f"Manifest missing columns: {required_cols - set(manifest.columns)}")
+    if len(manifest) < 20:
+        raise SystemExit("Manifest contains too few artefacts")
+
+    for row in manifest.itertuples(index=False):
+        path = Path(row.path)
+        require_file(path)
+        if int(row.bytes) != path.stat().st_size:
+            raise SystemExit(f"Manifest size mismatch for {path}")
+        if row.sha256 != sha256_file(path):
+            raise SystemExit(f"Manifest checksum mismatch for {path}")
+
+
+def validate_h5ad_outputs() -> None:
+    adata = sc.read_h5ad(RESULTS_DIR / "06_trajectory.h5ad")
+    if not 2500 <= adata.n_obs <= 2700:
+        raise SystemExit(f"Unexpected retained cell count: {adata.n_obs}")
+    for col in ["leiden", "cell_type", "dpt_pseudotime"]:
+        if col not in adata.obs:
+            raise SystemExit(f"Missing obs column in trajectory object: {col}")
+    if adata.obs["cell_type"].nunique() < 5:
+        raise SystemExit("Expected at least five annotated cell types")
+
+    t_cells = sc.read_h5ad(RESULTS_DIR / "07_t_cells.h5ad")
+    if t_cells.n_obs < 500:
+        raise SystemExit(f"Unexpected T cell subset size: {t_cells.n_obs}")
+    if "t_subtype" not in t_cells.obs:
+        raise SystemExit("Missing T cell subtype annotations")
+
+
+def validate_tables() -> None:
+    composition = pd.read_csv(RESULTS_DIR / "cell_type_composition.csv", index_col=0)
+    if composition.empty:
+        raise SystemExit("Cell type composition table is empty")
+    total = int(composition.iloc[:, 0].sum())
+
+    adata = sc.read_h5ad(RESULTS_DIR / "05_annotated.h5ad")
+    if total != adata.n_obs:
+        raise SystemExit(f"Composition total {total} != annotated cells {adata.n_obs}")
+
+    markers = pd.read_csv(RESULTS_DIR / "marker_genes.csv")
+    required_marker_cols = {"group", "names", "scores", "pvals_adj", "logfoldchanges"}
+    if not required_marker_cols.issubset(markers.columns):
+        raise SystemExit("Marker table is missing expected rank_genes_groups columns")
+    if len(markers) < 100:
+        raise SystemExit("Marker table contains too few marker rows")
+
+
+def main() -> None:
+    for relpath in REQUIRED_FILES:
+        require_file(RESULTS_DIR / relpath)
+    validate_manifest()
+    validate_h5ad_outputs()
+    validate_tables()
+    print("Validated single-cell pipeline outputs")
+
+
+if __name__ == "__main__":
+    main()

Original file line number	Diff line number	Diff line change
`@@ -23,6 +23,7 @@`
`23`	`23`	`("06 Trajectory inference", "06_trajectory.py"),`
`24`	`24`	`("07 T cell subclustering", "07_t_cell_subclustering.py"),`
`25`	`25`	`("08 Publication figures", "08_publication_figures.py"),`
	`26`	`+ ("09 Output manifest", "09_output_manifest.py"),`
`26`	`27`	`]`
`27`	`28`
`28`	`29`	`N_STEPS = len(STEPS)`