Skip to content

Commit b21bdbd

Browse files
committed
Improve reproducibility and CI validation
1 parent de81844 commit b21bdbd

21 files changed

Lines changed: 14426 additions & 14303 deletions

.github/workflows/ci.yml

Lines changed: 44 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ jobs:
2626
- name: Restore renv packages
2727
uses: r-lib/actions/setup-renv@v2
2828

29-
- name: Install missing dependencies
29+
- name: Restore pinned dependencies
3030
run: Rscript 000_install_dependencies.R
3131

3232
- name: Verify renv.lock unchanged
@@ -42,3 +42,46 @@ jobs:
4242

4343
- name: Tests
4444
run: Rscript -e 'testthat::test_dir("tests/testthat")'
45+
46+
pipeline-rebuild:
47+
runs-on: ubuntu-latest
48+
49+
steps:
50+
- name: Checkout
51+
uses: actions/checkout@v4
52+
53+
- name: Install system dependencies
54+
run: |
55+
sudo apt-get update
56+
sudo apt-get install -y libglpk40
57+
58+
- name: Setup R
59+
uses: r-lib/actions/setup-r@v2
60+
with:
61+
r-version: "4.5.2"
62+
use-public-rspm: true
63+
64+
- name: Restore renv packages
65+
uses: r-lib/actions/setup-renv@v2
66+
67+
- name: Install pinned dependencies
68+
run: Rscript 000_install_dependencies.R
69+
70+
- name: Remove committed analysis outputs before rebuild
71+
run: rm -f results/figures/*.png results/tables/*.csv
72+
73+
- name: Rebuild analysis outputs
74+
run: Rscript run_all.R
75+
76+
- name: Validate rebuilt outputs
77+
run: Rscript -e 'testthat::test_dir("tests/testthat")'
78+
79+
- name: Verify regenerated tables match tracked results
80+
run: git diff --exit-code -- results/tables
81+
82+
- name: Verify key figures match tracked results
83+
run: >
84+
git diff --exit-code --
85+
results/figures/qc_library_size.png
86+
results/figures/pca_plot.png
87+
results/figures/volcano_plot.png

000_install_dependencies.R

Lines changed: 14 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -1,85 +1,36 @@
11
#!/usr/bin/env Rscript
2-
# Restore all required packages for the analysis pipeline using renv.
2+
# Restore the pinned project library from renv.lock.
33

44
is_macos <- identical(Sys.info()[["sysname"]], "Darwin")
5-
deps_fields <- c("Depends", "Imports", "LinkingTo")
65

7-
# Use a single, stable CRAN repo for both renv + non-renv installs.
86
options(repos = c(CRAN = "https://cloud.r-project.org"))
9-
Sys.setenv(RENV_CONFIG_PPM_ENABLED = "FALSE")
10-
11-
# Not every R distribution on macOS supports CRAN macOS binaries. Detect whether
12-
# a "binary" repo path exists; if not, fall back to source installs.
13-
binary_repo <- utils::contrib.url(getOption("repos"), type = "binary")
14-
source_repo <- utils::contrib.url(getOption("repos"), type = "source")
15-
has_cran_binaries <- is_macos && !identical(binary_repo, source_repo) && grepl("/bin/", binary_repo)
16-
install_type <- if (has_cran_binaries) "binary" else "source"
17-
options(pkgType = install_type)
7+
options(renv.consent = TRUE)
188

199
if (!file.exists("renv.lock")) {
2010
stop("File not found: renv.lock\nThis project uses renv for reproducible dependencies.")
2111
}
2212

23-
options(renv.consent = TRUE)
13+
if (is_macos) {
14+
binary_repo <- utils::contrib.url(getOption("repos"), type = "binary")
15+
source_repo <- utils::contrib.url(getOption("repos"), type = "source")
16+
has_cran_binaries <- !identical(binary_repo, source_repo) && grepl("/bin/", binary_repo)
17+
options(pkgType = if (has_cran_binaries) "binary" else "source")
18+
}
2419

2520
if (!requireNamespace("renv", quietly = TRUE)) {
2621
message("Installing renv...")
27-
install.packages("renv", dependencies = deps_fields)
28-
}
29-
30-
if (requireNamespace("BiocManager", quietly = TRUE)) {
31-
options(repos = BiocManager::repositories())
22+
install.packages("renv")
3223
}
3324

3425
message("Restoring packages from renv.lock...")
3526
renv::restore(prompt = FALSE)
3627

37-
if (is_macos && !requireNamespace("ggiraph", quietly = TRUE)) {
38-
message("\nPre-installing ggiraph (CRAN ", install_type, ")...")
39-
tryCatch(
40-
install.packages("ggiraph", type = install_type, dependencies = deps_fields),
41-
error = function(e) {
42-
message("ggiraph install failed: ", conditionMessage(e))
43-
message("If compilation is required, ensure Xcode CLT are installed:")
44-
message(" xcode-select --install")
45-
}
46-
)
47-
}
48-
49-
deps <- renv::dependencies()
50-
pkgs <- sort(unique(deps$Package))
51-
pkgs <- setdiff(pkgs, "renv")
52-
53-
missing <- pkgs[!vapply(pkgs, requireNamespace, quietly = TRUE, FUN.VALUE = logical(1))]
54-
if (length(missing) > 0) {
55-
message("Installing missing packages: ", paste(missing, collapse = ", "))
56-
for (pkg in missing) {
57-
message("\n==> Installing: ", pkg)
58-
tryCatch(
59-
renv::install(
60-
pkg,
61-
prompt = FALSE,
62-
type = install_type,
63-
dependencies = deps_fields
64-
),
65-
error = function(e) {
66-
message("\nFailed installing ", pkg, ": ", conditionMessage(e))
67-
if (is_macos) {
68-
message("\nmacOS troubleshooting:")
69-
message(" - Ensure Xcode Command Line Tools are installed: xcode-select --install")
70-
message(" - Then retry: Rscript 000_install_dependencies.R")
71-
}
72-
stop(e)
73-
}
74-
)
28+
status_output <- capture.output(status <- renv::status(), type = "output")
29+
if (!isTRUE(status$synchronized)) {
30+
if (length(status_output) > 0) {
31+
message(paste(status_output, collapse = "\n"))
7532
}
76-
}
77-
78-
if (!identical(Sys.getenv("CI"), "true")) {
79-
message("Updating renv.lock...")
80-
renv::snapshot(prompt = FALSE)
81-
} else {
82-
message("CI detected; skipping renv.lock snapshot.")
33+
stop("renv restore completed, but the project library is still not synchronized.")
8334
}
8435

8536
message("\nDone. Run: Rscript run_all.R")

README.md

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -97,15 +97,15 @@ Hierarchical clustering of top 50 DE genes shows consistent expression patterns
9797

9898
### Pathway Enrichment
9999

100-
**529 GO Biological Process terms** and **28 KEGG pathways** significantly enriched (FDR < 0.05).
100+
**529 GO Biological Process terms** and **26 KEGG pathways** significantly enriched (FDR < 0.05).
101101

102102
![GO Enrichment](results/figures/go_dotplot.png)
103103

104104
Top GO terms: cytoplasmic translation, response to virus, defense response to virus.
105105

106106
![KEGG Enrichment](results/figures/kegg_dotplot.png)
107107

108-
Top KEGG pathway: **Coronavirus disease - COVID-19** (FDR = 1.5×10<sup>-40</sup>), followed by NOD-like receptor signalling.
108+
Top KEGG pathway: **Coronavirus disease - COVID-19** (FDR = 4.5×10<sup>-39</sup>), followed by NOD-like receptor signalling.
109109

110110
### ISG Signalling Cascade
111111

@@ -130,6 +130,7 @@ Analysis runtime: ~0.5 min after data download (~2GB).
130130

131131
### Notes
132132
- To re-download the GEO dataset (otherwise the pipeline reuses existing `data/*.rds` outputs): `FORCE_DOWNLOAD=true Rscript scripts/00_get_data.R`
133+
- To continue without KEGG results when the KEGG service is unavailable: `ALLOW_KEGG_FAILURE=true Rscript scripts/06_enrichment.R`
133134
- Lint: `Rscript dev/lint.R`
134135
- Tests: `Rscript -e 'testthat::test_dir("tests/testthat")'`
135136
- Reproducibility details (expected outputs, network requirements): see `REPRODUCIBILITY.md`
@@ -150,6 +151,11 @@ Rscript dev/lint.R
150151
Rscript -e 'testthat::test_dir("tests/testthat")'
151152
```
152153

154+
Maintainers updating dependencies should refresh the lockfile explicitly:
155+
```sh
156+
Rscript dev/snapshot_lockfile.R
157+
```
158+
153159
## Citation Metadata
154160
- Zenodo DOI: `10.5281/zenodo.18432519`
155161
- For citation tooling, see `CITATION.cff`
@@ -166,7 +172,8 @@ bulk-rnaseq-differential-expression/
166172
├── CITATION.cff
167173
├── REPRODUCIBILITY.md
168174
├── dev/
169-
│ └── lint.R # Lint scripts/ via lintr
175+
│ ├── lint.R # Lint scripts/ via lintr
176+
│ └── snapshot_lockfile.R # Maintainer-only renv.lock refresh
170177
├── renv/
171178
│ ├── activate.R
172179
│ └── settings.json
@@ -181,7 +188,8 @@ bulk-rnaseq-differential-expression/
181188
│ ├── 05_model_diagnostics.R
182189
│ ├── 06_enrichment.R
183190
│ ├── 07_reproducibility.R
184-
│ └── 08_pathway_diagram.R
191+
│ ├── 08_pathway_diagram.R
192+
│ └── config.R # Shared analysis thresholds and helpers
185193
├── data/
186194
│ └── [RDS files]
187195
├── results/
@@ -241,6 +249,8 @@ This project uses `renv` for reproducible dependencies. Install/restore everythi
241249
Rscript 000_install_dependencies.R
242250
```
243251

252+
This command restores the pinned project library only; it does not modify `renv.lock`.
253+
244254
### Manual installation (optional)
245255
If you prefer to install packages manually instead of using `renv`:
246256

REPRODUCIBILITY.md

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,13 +11,19 @@ This repository is set up so a reviewer can reproduce the analysis with a small
1111
Run these commands from the repository root (i.e., a fresh clone):
1212

1313
```sh
14-
# Restore/install pinned dependencies (CRAN + Bioconductor)
14+
# Restore/install pinned dependencies from renv.lock
1515
Rscript 000_install_dependencies.R
1616

1717
# Run the full pipeline (downloads data if needed, then regenerates results)
1818
Rscript run_all.R
1919
```
2020

21+
Maintainers who intentionally change dependencies should refresh the lockfile explicitly:
22+
23+
```sh
24+
Rscript dev/snapshot_lockfile.R
25+
```
26+
2127
## Data Download Behaviour
2228
The data download step (`scripts/00_get_data.R`) is **idempotent**:
2329
- If `data/counts_raw.rds` and `data/metadata.rds` already exist, it will **skip** re-downloading.
@@ -33,6 +39,11 @@ Some steps require network access:
3339
- KEGG pathway annotation (via KEGG REST) in `scripts/06_enrichment.R`
3440

3541
If you are running in a restricted environment, these steps may fail until network access is available.
42+
If KEGG is temporarily unavailable and you still want the pipeline to continue locally, run:
43+
44+
```sh
45+
ALLOW_KEGG_FAILURE=true Rscript scripts/06_enrichment.R
46+
```
3647

3748
## Determinism
3849
The balanced subset selection uses a fixed seed (`set.seed(123)` in `scripts/01_qc.R`) so repeated runs should yield the same subset and downstream results, given the same package versions.
@@ -49,9 +60,11 @@ After a successful run, you should see (among others):
4960
# Check environment consistency against renv.lock
5061
Rscript -e 'renv::status()'
5162

52-
# Run smoke tests (validates presence/shape of committed artifacts)
63+
# Run output validation tests
5364
Rscript -e 'testthat::test_dir("tests/testthat")'
5465

5566
# Lint the analysis scripts
5667
Rscript dev/lint.R
5768
```
69+
70+
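GitHub Actions also performs a clean rebuild of the tracked analysis outputs and checks that the regenerated tables in `results/tables` and three key figures (`qc_library_size.png`, `pca_plot.png`, `volcano_plot.png`) match the committed versions.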

dev/snapshot_lockfile.R

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
#!/usr/bin/env Rscript
2+
# Maintainer helper: refresh renv.lock after intentionally changing dependencies.
3+
4+
if (!file.exists("renv.lock")) {
5+
stop("File not found: renv.lock")
6+
}
7+
8+
options(renv.consent = TRUE)
9+
10+
if (!requireNamespace("renv", quietly = TRUE)) {
11+
stop("Package 'renv' is required. Run: Rscript 000_install_dependencies.R")
12+
}
13+
14+
message("Updating renv.lock from the current project library...")
15+
renv::snapshot(prompt = FALSE)
16+
17+
status_output <- capture.output(status <- renv::status(), type = "output")
18+
if (!isTRUE(status$synchronized)) {
19+
if (length(status_output) > 0) {
20+
message(paste(status_output, collapse = "\n"))
21+
}
22+
stop("renv.lock was updated, but the project is still not synchronized.")
23+
}
24+
25+
message("renv.lock updated successfully.")
5 Bytes
Loading

results/figures/kegg_dotplot.png

-1.12 KB
Loading

results/figures/ma_plot.png

-10.3 KB
Loading
-1.26 KB
Loading

results/figures/volcano_plot.png

-27 Bytes
Loading

0 commit comments

Comments
 (0)