databio
diff --git a/‎.github/workflows/tests.yml‎
Lines changed: 81 additions & 0 deletions b/‎.github/workflows/tests.yml‎
Lines changed: 81 additions & 0 deletions
diff --git a/‎Makefile‎
Lines changed: 48 additions & 1 deletion b/‎Makefile‎
Lines changed: 48 additions & 1 deletion
diff --git a/‎pipelines/peppro.py‎
Lines changed: 8 additions & 2 deletions b/‎pipelines/peppro.py‎
Lines changed: 8 additions & 2 deletions
diff --git a/‎sample_pipeline_interface.yaml‎
Lines changed: 1 addition & 0 deletions b/‎sample_pipeline_interface.yaml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎tests/README.md‎
Lines changed: 166 additions & 0 deletions b/‎tests/README.md‎
Lines changed: 166 additions & 0 deletions
diff --git a/‎tests/looper_configs/.looper_pe_basic.yaml‎
Lines changed: 10 additions & 0 deletions b/‎tests/looper_configs/.looper_pe_basic.yaml‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎tests/looper_configs/.looper_pe_umi.yaml‎
Lines changed: 10 additions & 0 deletions b/‎tests/looper_configs/.looper_pe_umi.yaml‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎tests/looper_configs/.looper_se_basic.yaml‎
Lines changed: 10 additions & 0 deletions b/‎tests/looper_configs/.looper_se_basic.yaml‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎tests/looper_configs/.looper_se_coverage.yaml‎
Lines changed: 10 additions & 0 deletions b/‎tests/looper_configs/.looper_se_coverage.yaml‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎tests/looper_configs/.looper_se_fastp.yaml‎
Lines changed: 10 additions & 0 deletions b/‎tests/looper_configs/.looper_se_fastp.yaml‎
Lines changed: 10 additions & 0 deletions
@@ -0,0 +1,81 @@
+name: Tests
+
+# Triggers: pushes to the listed branches, pull requests targeting master or
+# dev, and manual dispatch from the Actions tab.
+on:
+  push:
+    branches: [master, dev, looper-update]
+  pull_request:
+    branches: [master, dev]
+  workflow_dispatch:
+    inputs:
+      # Declared as a boolean so the dispatch form shows a checkbox; a
+      # free-text value such as "True" or "yes" would otherwise silently fail
+      # the == 'true' comparison that gates the integration job.
+      # github.event.inputs still exposes the value as the string
+      # 'true' / 'false'.
+      run_integration:
+        description: "Run integration tests (requires self-hosted runner)"
+        required: false
+        type: boolean
+        default: false
+
+jobs:
+  # --------------------------------------------------------------------------
+  # Tier 1: Unit tests — no genome data or bioinformatics tools required.
+  # Runs on every push and pull request.
+  # --------------------------------------------------------------------------
+  # Runs pytest on tests/test_unit.py once per Python version in the matrix.
+  unit-tests:
+    name: Unit tests (Python ${{ matrix.python-version }})
+    runs-on: ubuntu-latest
+    strategy:
+      # Let every Python version finish even if one of them fails.
+      fail-fast: false
+      matrix:
+        # Quoted so YAML keeps the versions as strings rather than floats.
+        python-version: ["3.9", "3.11", "3.12"]
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install Python dependencies
+        run: pip install -r requirements.txt pytest
+
+      - name: Run unit tests
+        run: pytest tests/test_unit.py -v --tb=short
+
+  # --------------------------------------------------------------------------
+  # Tier 2: Integration tests — full pipeline runs.
+  # Requires a self-hosted runner with genome indices and tools installed.
+  # Runs only when triggered manually via workflow_dispatch with the
+  # run_integration input set to "true"; the job itself then exports
+  # RUN_INTEGRATION_TESTS=true for pytest.
+  # --------------------------------------------------------------------------
+  # Runs one pytest invocation of tests/test_integration.py per scenario in
+  # the matrix, on a self-hosted runner.
+  integration-tests:
+    name: Integration tests (${{ matrix.scenario }})
+    # Only a manual dispatch with run_integration set to 'true' runs this job;
+    # push and pull_request events always skip it.
+    if: >
+      github.event_name == 'workflow_dispatch' &&
+      github.event.inputs.run_integration == 'true'
+    runs-on: self-hosted
+    strategy:
+      # Keep the remaining scenarios running when one scenario fails.
+      fail-fast: false
+      matrix:
+        scenario:
+          - se_basic
+          - pe_basic
+          - se_groseq
+          - se_umi
+          - pe_umi
+          - se_fastp
+          - se_fastx
+          - se_fqdedup
+          - se_scale
+          - se_no_complexity
+          - se_nofifo
+          - se_coverage
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Install Python dependencies
+        run: pip install -r requirements.txt pytest
+
+      - name: Run integration test for ${{ matrix.scenario }}
+        env:
+          # Presumably the opt-in switch read by tests/test_integration.py;
+          # verify against that module.
+          RUN_INTEGRATION_TESTS: "true"
+        # -k restricts the run to tests whose names match this scenario.
+        run: >
+          pytest tests/test_integration.py -v --tb=short
+          -k "${{ matrix.scenario }}"
@@ -1,5 +1,52 @@
 test:
-	python pipelines/peppro.py  -P 3 -M 100 -O peppro_test -R -S test -G hg38  -Q single  -C peppro.yaml  --genome-size hs --prealignments rCRSd human_repeats -I examples/data/test_R1.fq.gz
+	python pipelines/peppro.py -P 3 -M 100 -O peppro_test -R -S test -G hg38 \
+		-Q single -C peppro.yaml \
+		--protocol PRO \
+		--prealignment-names rCRSd human_repeats \
+		--genome-index $$(refgenie seek hg38/bowtie2_index --seek-key dir) \
+		--chrom-sizes $$(refgenie seek hg38/fasta --seek-key chrom_sizes) \
+		--pipestat-schema peppro_output_schema.yaml \
+		-I examples/data/test_r1.fq.gz
+
+# -----------------------------------------------------------------------
+# Test suite targets
+# -----------------------------------------------------------------------
+
+# Run only unit tests (no genome data or external tools required)
+test-unit:
+	pytest tests/test_unit.py -v --tb=short
+
+# Run a single integration scenario (e.g. make test-scenario SCENARIO=se_basic)
+SCENARIO ?= se_basic
+test-scenario:
+	RUN_INTEGRATION_TESTS=true pytest tests/test_integration.py -v --tb=short -k "$(SCENARIO)"
+
+# Run all SE integration scenarios
+test-se:
+	RUN_INTEGRATION_TESTS=true pytest tests/test_integration.py -v --tb=short \
+		-k "se_basic or se_groseq or se_umi or se_fastp or se_fastx or se_fqdedup or se_scale or se_no_complexity or se_nofifo or se_coverage"
+
+# Run all PE integration scenarios
+test-pe:
+	RUN_INTEGRATION_TESTS=true pytest tests/test_integration.py -v --tb=short \
+		-k "pe_basic or pe_umi"
+
+# Run recovery regression tests
+test-recovery:
+	RUN_INTEGRATION_TESTS=true pytest tests/test_integration.py -v --tb=short \
+		-k "recovery"
+
+# Run all integration tests (SE + PE + recovery)
+test-integration:
+	RUN_INTEGRATION_TESTS=true pytest tests/test_integration.py -v --tb=short
+
+# Run both unit and integration tests
+test-all:
+	RUN_INTEGRATION_TESTS=true pytest tests/ -v --tb=short
+
+# Regenerate test FASTQ data files from the source R1 read file
+test-data:
+	bash tests/scripts/generate_test_data.sh
 
 docker:
 	docker build -t databio/peppro -f containers/peppro.Dockerfile .
 
@@ -173,6 +173,10 @@ def parse_arguments():
                         dest="complexity",
                         help="Disable library complexity calculation (faster).")
 
+    parser.add_argument("--no-bw", action='store_true', default=False,
+                        dest="no_bw",
+                        help="Skip bigWig signal track generation (faster, for testing).")
+
     parser.add_argument("--prioritize", action='store_true', default=False,
                         dest="prioritize",
                         help="Plot cFRiF/FRiF using mutually exclusive priority"
@@ -3906,8 +3910,10 @@ def count_unmapped_reads():
         signal_folder, args.sample_name + "_minus_exact_body_0-mer.bw")
     minus_smooth_bw = os.path.join(
         signal_folder, args.sample_name + "_minus_smooth_body_0-mer.bw")
-    
-    if not args.sob:
+
+    if args.no_bw:
+        pm.timestamp("### Skipping bigWig generation (--no-bw)")
+    elif not args.sob:
         # If not scaling we don't need to use seqOutBias to generate the
         # separate strand bigWigs; just convert the BAM's directly with 
         # bamSitesToWig.py which uses UCSC wigToBigWig
 
@@ -40,6 +40,7 @@ sample_interface:
     {% if sample.keep_mito is defined %} --keep-mito {% endif %}
     {% if sample.no_fifo is defined %} --noFIFO {% endif %}
     {% if sample.complexity is defined %} --no-complexity {% endif %}
+    {% if sample.no_bw is defined %} --no-bw {% endif %}
     {% if sample.prioritize is defined %} --prioritize {% endif %}
     {% if sample.config_file is defined %} -C {sample.config_file} {% endif %}
     --pipestat-config {pipestat.config_file}
 
@@ -0,0 +1,166 @@
+# PEPPRO Test Suite
+
+This directory contains the PEPPRO test suite, organized into two tiers:
+
+- **Unit tests** — fast, no genome data or external bioinformatics tools required; run on every push/PR via GitHub Actions
+- **Integration tests** — full pipeline runs; require a self-hosted runner with genome indices and all tools installed
+
+---
+
+## Directory Structure
+
+```
+tests/
+├── data/                       # Small test FASTQ files (~3 MB total)
+│   ├── test_R1.fastq.gz        # SE reads (12,500 reads)
+│   ├── test_R2.fastq.gz        # PE reverse reads (rev-comp of R1)
+│   └── test_R1_umi.fastq.gz    # R1 with 8-nt UMI prefix for UMI tests
+├── pep_configs/                # PEP project configs for each scenario
+│   ├── se_basic.yaml / .csv
+│   ├── pe_basic.yaml / .csv
+│   └── ...
+├── looper_configs/             # Looper run configs for each scenario
+│   ├── .looper_se_basic.yaml
+│   └── ...
+├── scripts/
+│   └── generate_test_data.sh   # Regenerate test FASTQ data from source
+├── test_unit.py                # Unit tests (no tools/genome needed)
+├── test_integration.py         # Integration tests (full pipeline runs)
+└── README.md                   # This file
+```
+
+---
+
+## Unit Tests
+
+Unit tests cover:
+
+- **Constants**: `RUNON_SOURCE`, `ADAPTER_REMOVERS`, `TRIMMERS`, `DEDUPLICATORS` values and defaults
+- **PEP loading**: Each test config loads correctly with expected sample attributes
+- **Schema validation**: eido validation passes for valid configs; regression tests ensure invalid inputs (e.g., integer `umi_len` in YAML `imply`, invalid `protocol`/`adapter`/`trimmer`/`dedup` enum values) fail correctly
+- **Argument parsing**: All CLI flags parse correctly, defaults are correct, invalid choices raise `SystemExit`
+- **Recovery paths**: Expected output file naming conventions are documented and verified
+
+### Running unit tests
+
+```bash
+# Via pytest directly
+pytest tests/test_unit.py -v
+
+# Via Makefile
+make test-unit
+```
+
+No environment variables or external tools are needed.
+
+---
+
+## Integration Tests
+
+Integration tests run the full PEPPRO pipeline for each scenario and verify:
+
+1. Pipeline exits with status `0`
+2. Key output files exist (BAM, bigWig, stats.yaml)
+3. `stats.yaml` contains the expected result keys
+4. The `TestRecovery` class additionally tests checkpoint skipping and the `unmap_R1.fq` recovery regression
+
+### Prerequisites
+
+The integration tests require a machine with all PEPPRO dependencies installed and genome assets configured via refgenie:
+
+| Tool | Version tested |
+|------|---------------|
+| bowtie2 | ≥2.4 |
+| samtools | ≥1.13 |
+| bedtools | ≥2.30 |
+| cutadapt | ≥4.0 |
+| fastp | ≥0.23 |
+| seqtk | ≥1.3 |
+| fastx_toolkit | any |
+| seqkit | ≥2.0 |
+| fqdedup | any |
+| fastq_pair | any |
+| wigToBigWig | UCSC |
+| bedGraphToBigWig | UCSC |
+
+**Genome assets** (via refgenie, pointed to by `$REFGENIE`):
+
+- `hg38/bowtie2_index`
+- `human_rDNA/bowtie2_index`
+- `hg38/fasta` (for chromosome sizes)
+- `hg38/blacklist` (optional, for coverage tests)
+
+### Running integration tests
+
+Tests run with `-p local` (divvy local compute package) so the pipeline
+executes inline on the current node rather than being submitted to a job
+scheduler. Run integration tests from a compute node or interactive session
+if your cluster policy prohibits CPU-intensive work on login nodes.
+
+```bash
+# Enable integration tests
+export RUN_INTEGRATION_TESTS=true
+
+# Run a specific scenario
+pytest tests/test_integration.py -v -k se_basic
+
+# Via Makefile targets
+make test-se          # All SE scenarios
+make test-pe          # All PE scenarios
+make test-recovery    # Recovery regression tests
+make test-integration # All integration tests
+make test-all         # Unit + integration
+
+# Run a single named scenario
+make test-scenario SCENARIO=se_fastp
+
+# Keep output directories for debugging (default: cleaned up after each class)
+KEEP_TEST_OUTPUTS=true RUN_INTEGRATION_TESTS=true pytest tests/test_integration.py -v -k se_basic
+```
+
+---
+
+## Test Scenarios
+
+| Scenario | Read type | Protocol | Adapter | Trimmer | Dedup | Notes |
+|----------|-----------|----------|---------|---------|-------|-------|
+| `se_basic` | SE | PRO-seq | cutadapt | seqtk | — | Baseline SE run |
+| `pe_basic` | PE | PRO-seq | cutadapt | seqtk | — | Baseline PE run |
+| `se_groseq` | SE | GRO-seq | cutadapt | seqtk | — | GRO-seq protocol |
+| `se_umi` | SE | PRO-seq | cutadapt | seqtk | seqkit | 8-nt UMI deduplication |
+| `pe_umi` | PE | PRO-seq | cutadapt | seqtk | seqkit | PE with UMI dedup |
+| `se_fastp` | SE | PRO-seq | fastp | seqtk | — | fastp adapter trimming |
+| `se_fastx` | SE | PRO-seq | cutadapt | fastx | — | fastx_trimmer |
+| `se_fqdedup` | SE | PRO-seq | cutadapt | seqtk | fqdedup | fqdedup UMI dedup |
+| `se_scale` | SE | PRO-seq | cutadapt | seqtk | — | `--scale` flag |
+| `se_no_complexity` | SE | PRO-seq | cutadapt | seqtk | — | `--no-complexity` flag |
+| `se_nofifo` | SE | PRO-seq | cutadapt | seqtk | — | `--noFIFO` flag |
+| `se_coverage` | SE | PRO-seq | cutadapt | seqtk | — | `--coverage` flag |
+
+---
+
+## Test Data
+
+The files in `tests/data/` are derived from `examples/data/test_r1.fq.gz` (the existing pipeline example read file). They are small enough to commit to the repository (~1 MB each).
+
+To regenerate the test data files (requires `seqtk`):
+
+```bash
+make test-data
+# or
+bash tests/scripts/generate_test_data.sh
+```
+
+---
+
+## GitHub Actions
+
+Unit tests run automatically on every push and pull request targeting `master` or `dev`, across Python 3.9, 3.11, and 3.12.
+
+Integration tests are triggered manually via **workflow_dispatch** on a self-hosted runner:
+
+1. Go to **Actions** → **Tests** → **Run workflow**
+2. Set "Run integration tests" to `true`
+3. Click **Run workflow**
+
+See `.github/workflows/tests.yml` for the full configuration.
@@ -0,0 +1,10 @@
+# Looper run config for the pe_basic test scenario: names the scenario's PEP,
+# the sample- and project-level pipeline interfaces, and the output location.
+pep_config: "../pep_configs/pe_basic.yaml"
+
+output_dir: "${HOME}/peppro_test_pe_basic"
+
+pipeline_interfaces:
+  - "../../sample_pipeline_interface.yaml"
+  - "../../project_pipeline_interface.yaml"
+
+pipestat:
+  # Sits under output_dir above; {record_identifier} is presumably filled in
+  # per sample by pipestat (confirm).
+  results_file_path: "${HOME}/peppro_test_pe_basic/results_pipeline/{record_identifier}/stats.yaml"
@@ -0,0 +1,10 @@
+# Looper run config for the pe_umi test scenario: names the scenario's PEP,
+# the sample- and project-level pipeline interfaces, and the output location.
+pep_config: "../pep_configs/pe_umi.yaml"
+
+output_dir: "${HOME}/peppro_test_pe_umi"
+
+pipeline_interfaces:
+  - "../../sample_pipeline_interface.yaml"
+  - "../../project_pipeline_interface.yaml"
+
+pipestat:
+  # Sits under output_dir above; {record_identifier} is presumably filled in
+  # per sample by pipestat (confirm).
+  results_file_path: "${HOME}/peppro_test_pe_umi/results_pipeline/{record_identifier}/stats.yaml"
@@ -0,0 +1,10 @@
+# Looper run config for the se_basic test scenario: names the scenario's PEP,
+# the sample- and project-level pipeline interfaces, and the output location.
+pep_config: "../pep_configs/se_basic.yaml"
+
+output_dir: "${HOME}/peppro_test_se_basic"
+
+pipeline_interfaces:
+  - "../../sample_pipeline_interface.yaml"
+  - "../../project_pipeline_interface.yaml"
+
+pipestat:
+  # Sits under output_dir above; {record_identifier} is presumably filled in
+  # per sample by pipestat (confirm).
+  results_file_path: "${HOME}/peppro_test_se_basic/results_pipeline/{record_identifier}/stats.yaml"
@@ -0,0 +1,10 @@
+# Looper run config for the se_coverage test scenario: names the scenario's
+# PEP, the sample- and project-level pipeline interfaces, and the output
+# location.
+pep_config: "../pep_configs/se_coverage.yaml"
+
+output_dir: "${HOME}/peppro_test_se_coverage"
+
+pipeline_interfaces:
+  - "../../sample_pipeline_interface.yaml"
+  - "../../project_pipeline_interface.yaml"
+
+pipestat:
+  # Sits under output_dir above; {record_identifier} is presumably filled in
+  # per sample by pipestat (confirm).
+  results_file_path: "${HOME}/peppro_test_se_coverage/results_pipeline/{record_identifier}/stats.yaml"
@@ -0,0 +1,10 @@
+# Looper run config for the se_fastp test scenario: names the scenario's PEP,
+# the sample- and project-level pipeline interfaces, and the output location.
+pep_config: "../pep_configs/se_fastp.yaml"
+
+output_dir: "${HOME}/peppro_test_se_fastp"
+
+pipeline_interfaces:
+  - "../../sample_pipeline_interface.yaml"
+  - "../../project_pipeline_interface.yaml"
+
+pipestat:
+  # Sits under output_dir above; {record_identifier} is presumably filled in
+  # per sample by pipestat (confirm).
+  results_file_path: "${HOME}/peppro_test_se_fastp/results_pipeline/{record_identifier}/stats.yaml"