Skip to content

Commit 60ee740

Browse files
committed
Add CI, airway dataset samplesheet, configurable reference level
- GitHub Actions CI runs test profile on push/PR
- Samplesheet updated for Himes et al. airway dataset (GSE52778): 4 samples, 2 untreated + 2 dexamethasone
- DESeq2 reference condition now configurable via params.ref_condition
- Adaptive gene filter: min_count=10 for real data, min_count=1 for synthetic test data (based on library size)
- Contrast auto-detects treatment vs reference level
1 parent 7515205 commit 60ee740

5 files changed

Lines changed: 70 additions & 16 deletions

File tree

.github/workflows/ci.yml

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
name: CI
2+
3+
on:
4+
push:
5+
branches: [main]
6+
pull_request:
7+
branches: [main]
8+
9+
jobs:
10+
test:
11+
runs-on: ubuntu-latest
12+
steps:
13+
- uses: actions/checkout@v4
14+
15+
- uses: actions/setup-java@v4
16+
with:
17+
distribution: temurin
18+
java-version: 21
19+
20+
- name: Install Nextflow
21+
run: |
22+
curl -s https://get.nextflow.io | bash
23+
sudo mv nextflow /usr/local/bin/
24+
25+
- name: Generate test data
26+
run: python test/create_test_data.py
27+
28+
- name: Build test HISAT2 index
29+
run: |
30+
docker run --rm -v "$GITHUB_WORKSPACE/test:/data" \
31+
quay.io/biocontainers/hisat2:2.2.1--hdbdd923_6 \
32+
hisat2-build -q /data/genome.fa /data/genome
33+
34+
- name: Run pipeline (test profile)
35+
run: |
36+
nextflow run main.nf \
37+
-profile test,docker \
38+
--genome_index "$GITHUB_WORKSPACE/test/genome" \
39+
--gtf "$GITHUB_WORKSPACE/test/genes.gtf"
40+
41+
- name: Verify outputs
42+
run: |
43+
test -f results/counts/gene_counts.txt
44+
test -f results/deseq2/deseq2_results.csv
45+
test -f results/deseq2/volcano_plot.png
46+
test -f results/deseq2/pca_plot.png
47+
test -f results/multiqc/multiqc_report.html
48+
echo "All outputs verified"

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,3 +3,6 @@ results/
33
.nextflow/
44
.nextflow.log*
55
*.html
6+
data/
7+
genome/
8+
*.sra

assets/samplesheet.csv

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,5 @@
11
sample_id,fastq_1,fastq_2,condition
2-
SRR11886868,data/SRR11886868_1.fastq.gz,data/SRR11886868_2.fastq.gz,positive
3-
SRR11886869,data/SRR11886869_1.fastq.gz,data/SRR11886869_2.fastq.gz,positive
4-
SRR11886870,data/SRR11886870_1.fastq.gz,data/SRR11886870_2.fastq.gz,positive
5-
SRR11886871,data/SRR11886871_1.fastq.gz,data/SRR11886871_2.fastq.gz,negative
6-
SRR11886872,data/SRR11886872_1.fastq.gz,data/SRR11886872_2.fastq.gz,negative
7-
SRR11886873,data/SRR11886873_1.fastq.gz,data/SRR11886873_2.fastq.gz,negative
2+
SRR1039508,data/SRR1039508_1.fastq.gz,data/SRR1039508_2.fastq.gz,untreated
3+
SRR1039509,data/SRR1039509_1.fastq.gz,data/SRR1039509_2.fastq.gz,dexamethasone
4+
SRR1039512,data/SRR1039512_1.fastq.gz,data/SRR1039512_2.fastq.gz,untreated
5+
SRR1039513,data/SRR1039513_1.fastq.gz,data/SRR1039513_2.fastq.gz,dexamethasone

main.nf

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ params.genome_index = null
1212
params.gtf = null
1313
params.outdir = "results"
1414
params.strandedness = 2 // 0=unstranded, 1=forward, 2=reverse
15+
params.ref_condition = "untreated" // DESeq2 reference level
1516

1617
// Validate required params
1718
if (!params.genome_index) { error "Provide --genome_index (path to HISAT2 index prefix)" }
@@ -191,20 +192,19 @@ process DESEQ2 {
191192
ss <- ss[match(colnames(counts), ss\$sample_id), ]
192193
stopifnot(all(colnames(counts) == ss\$sample_id))
193194
194-
# DESeq2
195-
dds <- DESeqDataSetFromMatrix(counts, ss, design = ~ condition)
196-
dds\$condition <- relevel(dds\$condition, ref = "negative")
197-
198-
# Filter low-count genes (keep genes with at least 1 count in 2+ samples)
199-
keep <- rowSums(counts >= 1) >= 2
195+
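# Filter low-count genes (adaptive: keep genes with >= min_count counts in at least max(2, floor(n_samples / 2)) samples; min_count is 10 when the largest library exceeds 1e6 counts, otherwise 1)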
196+
min_count <- ifelse(max(colSums(counts)) > 1e6, 10, 1)
197+
min_samples <- max(2, floor(ncol(counts) / 2))
198+
keep <- rowSums(counts >= min_count) >= min_samples
200199
counts <- counts[keep, , drop = FALSE]
201-
cat(sprintf("Genes passing filter: %d / %d\\n", sum(keep), length(keep)))
200+
cat(sprintf("Genes passing filter: %d (min_count=%d, min_samples=%d)\\n", sum(keep), min_count, min_samples))
202201
202+
# DESeq2
203203
dds <- DESeqDataSetFromMatrix(counts, ss, design = ~ condition)
204-
dds\$condition <- relevel(dds\$condition, ref = "negative")
204+
dds\$condition <- relevel(dds\$condition, ref = "${params.ref_condition}")
205205
206206
tryCatch({
207-
dds <- DESeq(dds, sfType = "poscounts")
207+
dds <- DESeq(dds)
208208
}, error = function(e) {
209209
# Fallback for small/test datasets where dispersion fitting fails
210210
cat("Note: using gene-wise dispersion estimates (expected for small test data)\\n")
@@ -213,7 +213,11 @@ process DESEQ2 {
213213
dispersions(dds) <<- mcols(dds)\$dispGeneEst
214214
dds <<- nbinomWaldTest(dds)
215215
})
216-
res <- results(dds, contrast = c("condition", "positive", "negative"), alpha = 0.05)
216+
# Use the first non-reference condition level as the treatment level for the contrast (any further levels are not contrasted)
217+
cond_levels <- levels(dds\$condition)
218+
treat_level <- cond_levels[cond_levels != "${params.ref_condition}"][1]
219+
cat(sprintf("Contrast: %s vs %s\\n", treat_level, "${params.ref_condition}"))
220+
res <- results(dds, contrast = c("condition", treat_level, "${params.ref_condition}"), alpha = 0.05)
217221
res <- res[order(res\$padj), ]
218222
219223
# Save results

nextflow.config

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ profiles {
2525
params.genome_index = "${projectDir}/test/genome"
2626
params.gtf = "${projectDir}/test/genes.gtf"
2727
params.strandedness = 0 // synthetic data is unstranded
28+
params.ref_condition = "negative"
2829

2930
process {
3031
cpus = 1

0 commit comments

Comments
 (0)