Add CI, airway dataset samplesheet, configurable reference level

Ekin-Kahraman · Ekin-Kahraman · commit 60ee740c5e50 · 2026-04-05T03:05:31.000+01:00
- GitHub Actions CI runs test profile on push/PR
- Samplesheet updated for Himes et al. airway dataset (GSE52778):
  4 samples, 2 untreated + 2 dexamethasone
- DESeq2 reference condition now configurable via params.ref_condition
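- Adaptive gene filter: min_count=10 when the largest library has more than
  1e6 total counts (real data), min_count=1 otherwise (synthetic test data);
  a gene is kept if it reaches min_count in max(2, floor(n_samples / 2)) samples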
- Contrast auto-detects the treatment level (the first condition level that is
  not the reference) and compares it against the reference level
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -0,0 +1,48 @@
+name: CI
+
+on:
+  push:
+    branches: [main]
+  pull_request:
+    branches: [main]
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: actions/setup-java@v4
+        with:
+          distribution: temurin
+          java-version: 21
+
+      - name: Install Nextflow
+        run: |
+          curl -s https://get.nextflow.io | bash
+          sudo mv nextflow /usr/local/bin/
+
+      - name: Generate test data
+        run: python test/create_test_data.py
+
+      - name: Build test HISAT2 index
+        run: |
+          docker run --rm -v "$GITHUB_WORKSPACE/test:/data" \
+            quay.io/biocontainers/hisat2:2.2.1--hdbdd923_6 \
+            hisat2-build -q /data/genome.fa /data/genome
+
+      - name: Run pipeline (test profile)
+        run: |
+          nextflow run main.nf \
+            -profile test,docker \
+            --genome_index "$GITHUB_WORKSPACE/test/genome" \
+            --gtf "$GITHUB_WORKSPACE/test/genes.gtf"
+
+      - name: Verify outputs
+        run: |
+          test -f results/counts/gene_counts.txt
+          test -f results/deseq2/deseq2_results.csv
+          test -f results/deseq2/volcano_plot.png
+          test -f results/deseq2/pca_plot.png
+          test -f results/multiqc/multiqc_report.html
+          echo "All outputs verified"
diff --git a/.gitignore b/.gitignore
@@ -3,3 +3,6 @@ results/
 .nextflow/
 .nextflow.log*
 *.html
+data/
+genome/
+*.sra
diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv
@@ -1,7 +1,5 @@
 sample_id,fastq_1,fastq_2,condition
-SRR11886868,data/SRR11886868_1.fastq.gz,data/SRR11886868_2.fastq.gz,positive
-SRR11886869,data/SRR11886869_1.fastq.gz,data/SRR11886869_2.fastq.gz,positive
-SRR11886870,data/SRR11886870_1.fastq.gz,data/SRR11886870_2.fastq.gz,positive
-SRR11886871,data/SRR11886871_1.fastq.gz,data/SRR11886871_2.fastq.gz,negative
-SRR11886872,data/SRR11886872_1.fastq.gz,data/SRR11886872_2.fastq.gz,negative
-SRR11886873,data/SRR11886873_1.fastq.gz,data/SRR11886873_2.fastq.gz,negative
+SRR1039508,data/SRR1039508_1.fastq.gz,data/SRR1039508_2.fastq.gz,untreated
+SRR1039509,data/SRR1039509_1.fastq.gz,data/SRR1039509_2.fastq.gz,dexamethasone
+SRR1039512,data/SRR1039512_1.fastq.gz,data/SRR1039512_2.fastq.gz,untreated
+SRR1039513,data/SRR1039513_1.fastq.gz,data/SRR1039513_2.fastq.gz,dexamethasone
diff --git a/main.nf b/main.nf
@@ -12,6 +12,7 @@ params.genome_index = null
 params.gtf = null
 params.outdir = "results"
 params.strandedness = 2  // 0=unstranded, 1=forward, 2=reverse
+params.ref_condition = "untreated"  // DESeq2 reference level
 
 // Validate required params
 if (!params.genome_index) { error "Provide --genome_index (path to HISAT2 index prefix)" }
@@ -191,20 +192,19 @@ process DESEQ2 {
     ss <- ss[match(colnames(counts), ss\$sample_id), ]
     stopifnot(all(colnames(counts) == ss\$sample_id))
 
-    # DESeq2
-    dds <- DESeqDataSetFromMatrix(counts, ss, design = ~ condition)
-    dds\$condition <- relevel(dds\$condition, ref = "negative")
-
-    # Filter low-count genes (keep genes with at least 1 count in 2+ samples)
-    keep <- rowSums(counts >= 1) >= 2
+    # Filter low-count genes (adaptive: keep genes with >= min_count counts in at least max(2, floor(n_samples / 2)) samples)
+    min_count <- ifelse(max(colSums(counts)) > 1e6, 10, 1)
+    min_samples <- max(2, floor(ncol(counts) / 2))
+    keep <- rowSums(counts >= min_count) >= min_samples
     counts <- counts[keep, , drop = FALSE]
-    cat(sprintf("Genes passing filter: %d / %d\\n", sum(keep), length(keep)))
+    cat(sprintf("Genes passing filter: %d (min_count=%d, min_samples=%d)\\n", sum(keep), min_count, min_samples))
 
+    # DESeq2
     dds <- DESeqDataSetFromMatrix(counts, ss, design = ~ condition)
-    dds\$condition <- relevel(dds\$condition, ref = "negative")
+    dds\$condition <- relevel(dds\$condition, ref = "${params.ref_condition}")
 
     tryCatch({
-        dds <- DESeq(dds, sfType = "poscounts")
+        dds <- DESeq(dds)
     }, error = function(e) {
         # Fallback for small/test datasets where dispersion fitting fails
         cat("Note: using gene-wise dispersion estimates (expected for small test data)\\n")
@@ -213,7 +213,11 @@ process DESEQ2 {
         dispersions(dds) <<- mcols(dds)\$dispGeneEst
         dds <<- nbinomWaldTest(dds)
     })
-    res <- results(dds, contrast = c("condition", "positive", "negative"), alpha = 0.05)
+    # Use the first non-reference condition level as the treatment side of the contrast (any further levels are ignored)
+    cond_levels <- levels(dds\$condition)
+    treat_level <- cond_levels[cond_levels != "${params.ref_condition}"][1]
+    cat(sprintf("Contrast: %s vs %s\\n", treat_level, "${params.ref_condition}"))
+    res <- results(dds, contrast = c("condition", treat_level, "${params.ref_condition}"), alpha = 0.05)
     res <- res[order(res\$padj), ]
 
     # Save results
diff --git a/nextflow.config b/nextflow.config
@@ -25,6 +25,7 @@ profiles {
         params.genome_index = "${projectDir}/test/genome"
         params.gtf = "${projectDir}/test/genes.gtf"
         params.strandedness = 0  // synthetic data is unstranded
+        params.ref_condition = "negative"
 
         process {
             cpus = 1