Skip to content

Commit 9ab3493

Browse files
committed
Add full pipeline smoke validation
1 parent 045479a commit 9ab3493

5 files changed

Lines changed: 214 additions & 1 deletion

File tree

.github/workflows/ci.yml

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,3 +33,25 @@ jobs:
3333
run: |
3434
pip install ruff
3535
ruff check scripts/ tests/
36+
37+
pipeline-smoke:
  # End-to-end smoke test: run every pipeline step on one Python version,
  # then validate the artefacts the run produced.
  runs-on: ubuntu-latest
  # Fail fast if a step hangs instead of holding the runner for the
  # 360-minute default job timeout.
  timeout-minutes: 30

  steps:
    - uses: actions/checkout@v4

    - name: Set up Python 3.12
      uses: actions/setup-python@v5
      with:
        python-version: "3.12"

    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        pip install -e ".[dev]"

    - name: Run full pipeline
      run: python run_pipeline.py

    - name: Validate generated outputs
      run: python scripts/validate_outputs.py

README.md

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,13 @@ Single-cell RNA-seq analysis pipeline in Python using [scanpy](https://scanpy.re
88

99
Retains 2,604 PBMCs, resolves 5 major immune populations plus CD4/CD8 T-cell subclusters, with CI-tested reproducibility.
1010

11+
## Production Readiness
12+
13+
- CI runs unit tests across Python 3.10, 3.11, and 3.12.
14+
- CI also runs the complete PBMC pipeline on Python 3.12 and validates generated `.h5ad`, CSV, PNG, PDF, and manifest artefacts.
15+
- `results/output_manifest.csv` is generated on each full run with file sizes and SHA-256 checksums for pipeline outputs.
16+
- `scripts/validate_outputs.py` checks retained cell counts, required annotations, T-cell subtyping, marker tables, figures, and manifest integrity.
17+
1118
<p align="center">
1219
<img src="docs/umap_3d_rotation.gif" alt="3D UMAP rotation showing PBMC immune cell clusters" width="600">
1320
</p>
@@ -55,6 +62,9 @@ PBMC 3k (10X Genomics, 2,700 cells)
5562
5663
5764
08 Figures ─────── Multi-panel publication figure (PNG 300 DPI + PDF vector)
65+
66+
67+
09 Manifest ─────── SHA-256 manifest for generated analysis artefacts
5868
```
5969

6070
## Pipeline
@@ -69,6 +79,7 @@ PBMC 3k (10X Genomics, 2,700 cells)
6979
| 06 | `06_trajectory.py` | PAGA partition-based graph abstraction, PAGA-initialised UMAP, diffusion pseudotime |
7080
| 07 | `07_t_cell_subclustering.py` | Extract T cell compartment, subcluster, resolve CD4+/CD8+ via marker scoring |
7181
| 08 | `08_publication_figures.py` | Multi-panel figure with UMAP, composition, marker heatmap, summary (PNG + PDF) |
82+
| 09 | `09_output_manifest.py` | Generate checksums and file-size manifest for analysis artefacts |
7283

7384
## Project Structure
7485

@@ -83,6 +94,8 @@ single-cell-rnaseq-immune-profiling/
8394
│ ├── 06_trajectory.py PAGA + diffusion pseudotime
8495
│ ├── 07_t_cell_subclustering.py CD4+/CD8+ resolution
8596
│ ├── 08_publication_figures.py Multi-panel figure (PNG + PDF)
97+
│ ├── 09_output_manifest.py Output checksums and file sizes
98+
│ ├── validate_outputs.py Full-run output validator
8699
│ └── palette.py Shared Okabe-Ito colourblind palette
87100
├── tests/
88101
│ └── test_pipeline.py 7 tests (QC, normalisation, clustering, markers)
@@ -140,6 +153,7 @@ cd single-cell-rnaseq-immune-profiling
140153
pip install -e . # or: pip install -r requirements-lock.txt
141154
python run_pipeline.py # full pipeline (~38s)
142155
python run_pipeline.py --from 6 # resume from trajectory step
156+
python scripts/validate_outputs.py
143157
```
144158

145159
For exact reproducibility, use `requirements-lock.txt` which pins all dependency versions.
@@ -151,7 +165,7 @@ pip install -e ".[dev]"
151165
pytest -v
152166
```
153167

154-
7 tests covering QC filtering, normalisation, HVG selection, clustering, and marker gene validation. CI runs on Python 3.10, 3.11, and 3.12.
168+
7 tests covering QC filtering, normalisation, HVG selection, clustering, and marker gene validation. CI runs tests on Python 3.10, 3.11, and 3.12, then runs and validates the full pipeline on Python 3.12.
155169

156170
## Design Decisions
157171

run_pipeline.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
("06 Trajectory inference", "06_trajectory.py"),
2424
("07 T cell subclustering", "07_t_cell_subclustering.py"),
2525
("08 Publication figures", "08_publication_figures.py"),
26+
("09 Output manifest", "09_output_manifest.py"),
2627
]
2728

2829
N_STEPS = len(STEPS)

scripts/09_output_manifest.py

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
#!/usr/bin/env python3
2+
"""Write a checksum manifest for generated pipeline artefacts."""
3+
4+
from __future__ import annotations
5+
6+
import csv
7+
import hashlib
8+
from pathlib import Path
9+
10+
# Directory scanned for generated artefacts (relative to the working directory).
RESULTS_DIR = Path("results")
# The manifest lives inside RESULTS_DIR and is excluded from its own listing.
MANIFEST_PATH = RESULTS_DIR / "output_manifest.csv"
# Lower-case file suffixes that count as pipeline artefacts.
INCLUDED_SUFFIXES = {".h5ad", ".csv", ".png", ".pdf"}
13+
14+
15+
def sha256_file(path: Path, *, chunk_size: int = 1024 * 1024) -> str:
    """Return the hexadecimal SHA-256 digest of the file at *path*.

    The file is read in binary mode in blocks of *chunk_size* bytes, so the
    whole file is never held in memory at once.

    Args:
        path: File to hash.
        chunk_size: Number of bytes read per block (default 1 MiB).

    Raises:
        ValueError: If *chunk_size* is not positive. A zero-sized read would
            return no data and silently yield the digest of an empty file.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")

    digest = hashlib.sha256()
    with path.open("rb") as handle:
        while chunk := handle.read(chunk_size):
            digest.update(chunk)
    return digest.hexdigest()
21+
22+
23+
def iter_artefacts():
    """Yield the files under RESULTS_DIR that belong in the manifest.

    Candidates are visited in sorted path order. A candidate is yielded when
    it is a file, is not the manifest itself, and its lower-cased suffix is
    one of INCLUDED_SUFFIXES.
    """
    for candidate in sorted(RESULTS_DIR.rglob("*")):
        wanted = (
            candidate.is_file()
            and candidate != MANIFEST_PATH
            and candidate.suffix.lower() in INCLUDED_SUFFIXES
        )
        if wanted:
            yield candidate
31+
32+
33+
def main() -> None:
    """Write the checksum manifest for every artefact found under RESULTS_DIR.

    Each row records the artefact's POSIX-style path, its size in bytes and
    its SHA-256 digest.

    Raises:
        SystemExit: If no artefacts are found.
    """
    RESULTS_DIR.mkdir(exist_ok=True)
    artefacts = list(iter_artefacts())
    if not artefacts:
        raise SystemExit("No generated artefacts found under results/")

    # Pin the encoding: the default is locale-dependent, while the validator
    # reads the manifest back with pandas, which defaults to UTF-8.
    with MANIFEST_PATH.open("w", newline="", encoding="utf-8") as handle:
        writer = csv.DictWriter(handle, fieldnames=["path", "bytes", "sha256"])
        writer.writeheader()
        for path in artefacts:
            writer.writerow(
                {
                    "path": path.as_posix(),
                    "bytes": path.stat().st_size,
                    "sha256": sha256_file(path),
                }
            )

    print(f"Wrote {MANIFEST_PATH} for {len(artefacts)} artefacts")


if __name__ == "__main__":
    main()

scripts/validate_outputs.py

Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
#!/usr/bin/env python3
2+
"""Validate outputs from a completed single-cell pipeline run."""
3+
4+
from __future__ import annotations
5+
6+
import hashlib
7+
from pathlib import Path
8+
9+
import pandas as pd
10+
import scanpy as sc
11+
12+
# Directory holding the pipeline outputs (relative to the working directory).
RESULTS_DIR = Path("results")

# Paths, relative to RESULTS_DIR, that must exist and be non-empty after a run.
REQUIRED_FILES = [
    "01_filtered.h5ad",
    "02_preprocessed.h5ad",
    "03_reduced.h5ad",
    "04_clustered.h5ad",
    "05_annotated.h5ad",
    "06_trajectory.h5ad",
    "07_t_cells.h5ad",
    "marker_genes.csv",
    "cell_type_composition.csv",
    "figures/01_qc_metrics.png",
    "figures/02_hvg_selection.png",
    "figures/03_pca_variance.png",
    "figures/04_clustering.png",
    "figures/05_marker_dotplot.png",
    "figures/05_cell_types_umap.png",
    "figures/06_paga_graph.png",
    "figures/06_paga_graph.pdf",
    "figures/06_trajectory.png",
    "figures/06_trajectory.pdf",
    "figures/07_t_cell_subclustering.png",
    "figures/07_t_cell_subclustering.pdf",
    "figures/07_t_cell_markers.png",
    "figures/08_publication_figure.png",
    "figures/08_publication_figure.pdf",
    "output_manifest.csv",
]
41+
42+
43+
def require_file(path: Path) -> None:
    """Exit unless *path* is an existing, non-empty regular file.

    Args:
        path: Expected output file.

    Raises:
        SystemExit: If *path* does not exist, is not a regular file (for
            example a directory of the same name), or has zero length.
    """
    if not path.exists():
        raise SystemExit(f"Missing expected output: {path}")
    # exists() is also true for directories, whose st_size is usually
    # non-zero, so they would otherwise pass as valid outputs.
    if not path.is_file():
        raise SystemExit(f"Expected output is not a regular file: {path}")
    if path.stat().st_size == 0:
        raise SystemExit(f"Output is empty: {path}")
48+
49+
50+
def sha256_file(path: Path, *, chunk_size: int = 1024 * 1024) -> str:
    """Return the hexadecimal SHA-256 digest of the file at *path*.

    The file is read in binary mode in blocks of *chunk_size* bytes, so the
    whole file is never held in memory at once.

    Args:
        path: File to hash.
        chunk_size: Number of bytes read per block (default 1 MiB).

    Raises:
        ValueError: If *chunk_size* is not positive. A zero-sized read would
            return no data and silently yield the digest of an empty file.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")

    digest = hashlib.sha256()
    with path.open("rb") as handle:
        while chunk := handle.read(chunk_size):
            digest.update(chunk)
    return digest.hexdigest()
56+
57+
58+
def validate_manifest(min_artefacts: int = 20) -> None:
    """Check that the output manifest is well-formed and matches the files on disk.

    Args:
        min_artefacts: Smallest number of manifest rows accepted; a shorter
            manifest is rejected as containing too few artefacts.

    Raises:
        SystemExit: If a required column is absent, the manifest has too few
            rows, a required field is blank, a listed file is missing or
            empty, or a recorded size or SHA-256 digest differs from the file
            on disk.
    """
    manifest_path = RESULTS_DIR / "output_manifest.csv"
    # Read every field as text so pandas never reinterprets a path or digest
    # that happens to look numeric; sizes are converted explicitly below.
    manifest = pd.read_csv(manifest_path, dtype=str)
    required_cols = {"path", "bytes", "sha256"}
    if not required_cols.issubset(manifest.columns):
        raise SystemExit(f"Manifest missing columns: {required_cols - set(manifest.columns)}")
    if len(manifest) < min_artefacts:
        raise SystemExit("Manifest contains too few artefacts")
    # Blank cells come back as NaN, which would otherwise surface as an
    # unhelpful TypeError/ValueError further down.
    if manifest[sorted(required_cols)].isna().any().any():
        raise SystemExit("Manifest contains blank path, bytes, or sha256 fields")

    for row in manifest.itertuples(index=False):
        path = Path(row.path)
        require_file(path)
        try:
            recorded_size = int(row.bytes)
        except ValueError:
            raise SystemExit(f"Manifest size is not an integer for {path}") from None
        if recorded_size != path.stat().st_size:
            raise SystemExit(f"Manifest size mismatch for {path}")
        if row.sha256 != sha256_file(path):
            raise SystemExit(f"Manifest checksum mismatch for {path}")
74+
75+
76+
def validate_h5ad_outputs() -> None:
    """Sanity-check the trajectory and T cell AnnData outputs.

    The trajectory object must hold between 2500 and 2700 observations, carry
    the ``leiden``, ``cell_type`` and ``dpt_pseudotime`` obs columns, and have
    at least five distinct ``cell_type`` values. The T cell object must hold
    at least 500 observations and carry a ``t_subtype`` obs column.

    Raises:
        SystemExit: On the first check that fails.
    """
    trajectory = sc.read_h5ad(RESULTS_DIR / "06_trajectory.h5ad")
    n_cells = trajectory.n_obs
    if n_cells < 2500 or n_cells > 2700:
        raise SystemExit(f"Unexpected retained cell count: {n_cells}")

    obs = trajectory.obs
    # Report the first absent column, checked in the listed order.
    absent = next(
        (name for name in ("leiden", "cell_type", "dpt_pseudotime") if name not in obs),
        None,
    )
    if absent is not None:
        raise SystemExit(f"Missing obs column in trajectory object: {absent}")
    if obs["cell_type"].nunique() < 5:
        raise SystemExit("Expected at least five annotated cell types")

    t_cells = sc.read_h5ad(RESULTS_DIR / "07_t_cells.h5ad")
    n_t_cells = t_cells.n_obs
    if n_t_cells < 500:
        raise SystemExit(f"Unexpected T cell subset size: {n_t_cells}")
    if "t_subtype" not in t_cells.obs:
        raise SystemExit("Missing T cell subtype annotations")
91+
92+
93+
def validate_tables() -> None:
    """Sanity-check the cell type composition and marker gene CSV tables.

    The first data column of the composition table must sum to the number of
    observations in the annotated AnnData object. The marker table must
    contain the expected ``rank_genes_groups`` columns and at least 100 rows.

    Raises:
        SystemExit: On the first check that fails.
    """
    composition_csv = RESULTS_DIR / "cell_type_composition.csv"
    composition = pd.read_csv(composition_csv, index_col=0)
    if composition.empty:
        raise SystemExit("Cell type composition table is empty")
    first_column = composition.iloc[:, 0]
    total = int(first_column.sum())

    annotated = sc.read_h5ad(RESULTS_DIR / "05_annotated.h5ad")
    n_annotated = annotated.n_obs
    if total != n_annotated:
        raise SystemExit(f"Composition total {total} != annotated cells {n_annotated}")

    markers = pd.read_csv(RESULTS_DIR / "marker_genes.csv")
    expected_cols = {"group", "names", "scores", "pvals_adj", "logfoldchanges"}
    # A non-empty difference means at least one expected column is absent.
    if expected_cols - set(markers.columns):
        raise SystemExit("Marker table is missing expected rank_genes_groups columns")
    if len(markers) < 100:
        raise SystemExit("Marker table contains too few marker rows")
109+
110+
111+
def main() -> None:
    """Run every output check in order and print a confirmation on success.

    Required files are checked first, followed by the manifest, the AnnData
    outputs and the CSV tables. Any failing check exits via SystemExit.
    """
    expected_outputs = [RESULTS_DIR / relpath for relpath in REQUIRED_FILES]
    for output in expected_outputs:
        require_file(output)

    for check in (validate_manifest, validate_h5ad_outputs, validate_tables):
        check()

    print("Validated single-cell pipeline outputs")


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)