Skip to content

Commit ddee4ad

Browse files
authored
Improve testing (#19)
* Add script to simulate reads * We use a main branch on this repo, not master * Rename ncbi-acc-download install script in workflow, capture art logs * Install ncbi-acc-download to conda env * Install conda env under opt * Cannot use --name and --prefix together * Give full path to conda env * Use conda directly * Run conda directly from workflow * Rename step and add miniconda to path * Generate simulated reads * Rename art install script in workflow * Re-arrange workflow steps * Create file as placeholder * Add ART to PATH * Install rename * Use alternate rename syntax * Build Kraken & Bracken DBs * Build Bracken DB and cache Kraken2/Bracken DB * Use custom accession2taxid file to avoid running out of disk space * Limit disk usage during kraken db build * Download example InterOp and use kraken accession list * skip header * do not copy Data directory from example InterOp data * Edit RunInfo.xml to match InterOp data * Touch up pull-request script * Save all results to artifacts dir * Improve caching and artifacts * Could not find v2 * Use v2.1.3 * Separate installation of system deps from miniconda * Adjust caching * Fix artifacts path * Add workflow for pushes to main branch * Adjust cache keys for caches that do not have good hash values * Do not require underscore after R1/R2 in fastq filenames * Use github-specific nextflow config while running tests * Separate ref genome list no longer necessary * Invoke nextflow to ensure all deps installed before caching * Use github config during initial nextflow run * Put config flag in correct location * Fix config file path * Fix config flag placement * Fix config file path * Cache ART installation * limit CPUs to 2 while testing * Set kraken2 cpus in config * Do not require underscore after R1/R2 in fastq filename * Cache ref genomes * Update * Update * Do not assume underscore after sample id * Invalidate ref genome cache * Check if output dir is already named correctly * Invalidate caches for ref genomes and kraken db * Fix if then fi bash syntax * Change read length to match simulated reads
1 parent f780a5c commit ddee4ad

23 files changed

+391
-52
lines changed

.github/config/nextflow.config

+21
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
params {
2+
kraken2_db = ".github/data/kraken2_db"
3+
bracken_db = ".github/data/kraken2_db"
4+
instrument_type = "miseq"
5+
}
6+
7+
profiles {
8+
conda {
9+
process.conda = "$baseDir/environments/environment.yml"
10+
if (params.cache){
11+
conda.cacheDir = params.cache
12+
}
13+
}
14+
}
15+
16+
17+
// Capture exit codes from upstream processes when piping
18+
process.shell = ['/bin/bash', '-euo', 'pipefail']
19+
20+
process.executor = 'local'
21+
process.cpus = 2
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
accession accession.version taxid gi
2+
NC_002695 NC_002695.2 386585 1447699251
3+
NC_016845 NC_016845.1 1125630 378976159
4+
NZ_CP033744 NZ_CP033744.1 546 1520496619
5+
NC_003197 NC_003197.2 99287 1109557564

.github/data/mock_runs/210101_M00000_0000_000000000-A1B2C/Data/Intensities/BaseCalls/placeholder

Whitespace-only changes.

.github/data/mock_runs/210101_M00000_0000_000000000-A1B2C/SampleSheet.csv

+2-2
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,8 @@ Assay,Illumina DNA Prep
1010
Index Adapters,IDT-Ilmn DNA-RNA UD Indexes SetA Tagmentation
1111
Chemistry,Amplicon
1212
[Reads]
13-
151
14-
151
13+
251
14+
251
1515
[Settings]
1616
ReverseComplement,0
1717
Adapter,CTGTCTCTTATACACATCT

.github/data/ref_genome_list.txt

-4
This file was deleted.

.github/scripts/create_bracken_db.sh

+15
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
#!/bin/bash
2+
3+
set -eo pipefail
4+
5+
export PATH=/opt/miniconda3/bin:$PATH
6+
7+
pushd ${PWD}/.github/data
8+
9+
bracken-build -d kraken2_db -l 250 > bracken_build.log
10+
11+
kraken2-build --clean --db kraken2_db
12+
13+
popd
14+
15+
cp ${PWD}/.github/data/bracken_build.log artifacts

.github/scripts/create_kraken2_db.sh

+23
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
#!/bin/bash
2+
3+
set -eo pipefail
4+
5+
export PATH=/opt/miniconda3/bin:$PATH
6+
7+
pushd ${PWD}/.github/data/kraken2_db/taxonomy
8+
9+
rsync --no-motd rsync://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz .
10+
11+
tar -xzf taxdump.tar.gz && rm taxdump.tar.gz
12+
13+
pushd ../..
14+
15+
for file in ref_genomes/*.fa; do
16+
kraken2-build --add-to-library ${file} --db kraken2_db
17+
done
18+
19+
kraken2-build --build --db kraken2_db > kraken_build.log
20+
21+
popd && popd
22+
23+
cp ${PWD}/.github/data/kraken_build.log artifacts
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
#!/bin/bash
2+
3+
set -eo pipefail
4+
5+
pushd ${PWD}/.github/data
6+
7+
# Only publicly-available InterOp data I've found...
8+
wget http://cf.10xgenomics.com/supp/cell-exp/cellranger-tiny-bcl-1.2.0.tar.gz
9+
10+
tar -xzf cellranger-tiny-bcl-1.2.0.tar.gz && rm cellranger-tiny-bcl-1.2.0.tar.gz
11+
12+
mv cellranger-tiny-bcl-1.2.0/InterOp mock_runs/210101_M00000_0000_000000000-A1B2C
13+
mv cellranger-tiny-bcl-1.2.0/RunInfo.xml mock_runs/210101_M00000_0000_000000000-A1B2C
14+
mv cellranger-tiny-bcl-1.2.0/runParameters.xml mock_runs/210101_M00000_0000_000000000-A1B2C
15+
16+
rm -r cellranger-tiny-bcl-1.2.0
17+
18+
# RunInfo.xml doesn't match InterOp data for some reason, so
19+
# edit RunInfo.xml to match
20+
cat mock_runs/210101_M00000_0000_000000000-A1B2C/RunInfo.xml | \
21+
sed 's/LaneCount="1"/LaneCount="2"/' | \
22+
sed 's/SurfaceCount="1"/SurfaceCount="2"/' | \
23+
sed 's/SwathCount="1"/SwathCount="2"/' | \
24+
sed 's/TileCount="1"/TileCount="32"/' \
25+
> RunInfo.edited.xml
26+
27+
mv RunInfo.edited.xml mock_runs/210101_M00000_0000_000000000-A1B2C/RunInfo.xml
28+
29+
popd

.github/scripts/download_reference_genomes.sh

+5-3
Original file line numberDiff line numberDiff line change
@@ -2,15 +2,17 @@
22

33
set -eo pipefail
44

5+
export PATH=/opt/miniconda3/bin:${PATH}
6+
57
echo "Download Reference Genomes..." >> artifacts/test_artifact.log
68

79
mkdir -p $PWD/.github/data/ref_genomes
810

911
pushd $PWD/.github/data/ref_genomes
1012

11-
while read -r accession; do
12-
ncbi-acc-download --format fasta ${accession}
13+
while IFS=$'\t' read -r accession accession_version taxid gi ; do
14+
ncbi-acc-download --format fasta ${accession_version}
1315
sleep 5
14-
done < ../ref_genome_list.txt
16+
done < <(tail -n+2 ../kraken2_db/taxonomy/nucl_gb.accession2taxid)
1517

1618
popd
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
#!/bin/bash
2+
3+
set -eo pipefail
4+
5+
export PATH=/opt/art/bin/:${PATH}
6+
7+
pushd ${PWD}/.github/data/mock_runs/210101_M00000_0000_000000000-A1B2C/Data/Intensities/BaseCalls/
8+
9+
art_illumina --seqSys MSv3 --paired -i ../../../../../ref_genomes/NC_002695.2.fa --fcov 5 --mflen 500 --sdev 25 --len 250 --noALN -o test-01_R > ../../../../../../../artifacts/test-01_read_generation_log.txt
10+
11+
art_illumina --seqSys MSv3 --paired -i ../../../../../ref_genomes/NC_016845.1.fa --fcov 5 --mflen 500 --sdev 25 --len 250 --noALN -o test-02_R > ../../../../../../../artifacts/test-02_read_generation_log.txt
12+
13+
art_illumina --seqSys MSv3 --paired -i ../../../../../ref_genomes/NZ_CP033744.1.fa --fcov 5 --mflen 500 --sdev 25 --len 250 --noALN -o test-03_R > ../../../../../../../artifacts/test-03_read_generation_log.txt
14+
15+
art_illumina --seqSys MSv3 --paired -i ../../../../../ref_genomes/NC_003197.2.fa --fcov 5 --mflen 500 --sdev 25 --len 250 --noALN -o negative-control_R > ../../../../../../../artifacts/negative-control_read_generation_log.txt
16+
17+
rename s/fq/fastq/ *.fq
18+
19+
gzip *.fastq
20+
21+
popd

.github/scripts/install_art.sh

+1-1
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ set -eo pipefail
44

55
echo "Install ART .." >> artifacts/test_artifact.log
66

7-
wget https://www.niehs.nih.gov/research/resources/assets/docs/artbinmountrainier2016.06.05linux64.tgz && tar -xzf artbin*
7+
wget --quiet https://www.niehs.nih.gov/research/resources/assets/docs/artbinmountrainier2016.06.05linux64.tgz && tar -xzf artbin*
88

99
mkdir -p /opt/art/bin
1010

.github/scripts/install_conda.sh

+1-3
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,7 @@ set -eo pipefail
44
echo "Install Miniconda .." >> artifacts/test_artifact.log
55

66
export PATH=/opt/miniconda3/bin:$PATH
7-
sudo apt-get update --fix-missing && sudo apt-get install -y wget bzip2 ca-certificates \
8-
libglib2.0-0 libxext6 libsm6 libxrender1 \
9-
git mercurial subversion
7+
108
wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh && \
119
/bin/bash ~/miniconda.sh -b -p /opt/miniconda3 && \
1210
rm ~/miniconda.sh && \
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
#!/bin/bash
2+
3+
set -eo pipefail
4+
5+
echo "Install NCBI Genome Download tool using conda.." >> artifacts/test_artifact.log
6+
7+
conda create --yes --prefix /opt/miniconda3/envs/ncbi-acc-download

.github/scripts/install_nextflow.sh

+7-1
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,16 @@ set -eo pipefail
33

44
echo Install Nextflow .. >> artifacts/test_artifact.log
55

6-
wget -qO- https://get.nextflow.io | bash
6+
wget -qO- https://get.nextflow.io | bash > nextflow_install.out 2> nextflow_install.err
77

88
mkdir -p /opt/nextflow/bin
99

1010
mv nextflow /opt/nextflow/bin
1111

1212
echo "export PATH=/opt/nextflow/bin:$PATH" >> ~/.bashrc
13+
14+
export PATH=/opt/nextflow/bin:$PATH
15+
16+
NXF_VER=20.10.0 nextflow -C .github/config/nextflow.config run hello
17+
18+
mv nextflow_install.out nextflow_install.err artifacts
+18
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
#!/bin/bash
2+
set -eo pipefail
3+
4+
echo "Install System Dependencies .." >> artifacts/test_artifact.log
5+
6+
sudo apt-get update --fix-missing
7+
8+
sudo apt-get install -y \
9+
wget \
10+
bzip2 \
11+
ca-certificates \
12+
libglib2.0-0 \
13+
libxext6 \
14+
libsm6 libxrender1 \
15+
git \
16+
mercurial \
17+
subversion \
18+
rename

.github/scripts/test_against_previous_release.sh

+19-20
Original file line numberDiff line numberDiff line change
@@ -7,37 +7,35 @@ export PATH=/opt/nextflow/bin:$PATH
77

88
# write test log as github Action artifact
99
echo "Nextflow run current PR..." >> artifacts/test_artifact.log
10-
NXF_VER=20.10.0 nextflow -quiet run ./main.nf \
10+
NXF_VER=20.10.0 nextflow -C ${PWD}/.github/config/nextflow.config -quiet run ./main.nf \
1111
-profile conda \
12-
--cache ~/.conda/envs \
13-
--kraken2_db $PWD/.github/data/___ \
14-
--bracken_db $PWD/.github/data/___ \
15-
--run_dir $PWD/.github/data/___ \
16-
--outdir pr_output
12+
--cache ${HOME}/.conda/envs \
13+
--kraken2_db ${PWD}/.github/data/kraken2_db \
14+
--bracken_db ${PWD}/.github/data/kraken2_db \
15+
--run_dir ${PWD}/.github/data/mock_runs/210101_M00000_0000_000000000-A1B2C \
16+
--outdir results
1717

18-
cp .nextflow.log artifacts/
18+
cp .nextflow.log artifacts/pull_request.nextflow.log
19+
cp -r results artifacts/pull_request_results
1920

2021
# run tests against the previous release to compare outputs
2122
git clone https://github.com/BCCDC-PHL/routine-sequence-qc.git previous_release
22-
cd previous_release
23-
git checkout 26220ef1217229beb73393e74c56a57ea90150bf
23+
pushd previous_release
24+
git checkout adfffed374ae0212b707b042233652704286b4d7 -b previous-release
2425

2526
echo "Nextflow run previous release..." >> ../artifacts/test_artifact.log
26-
NXF_VER=20.10.0 nextflow -quiet run ./main.nf \
27+
NXF_VER=20.10.0 nextflow -C ${PWD}/../.github/config/nextflow.config -quiet run ./main.nf \
2728
-profile conda \
28-
--cache ~/.conda/envs \
29-
--directory $PWD/../.github/data/fastqs/ \
30-
--ref $PWD/../.github/data/refs/MN908947.3/MN908947.3.fa \
31-
--bed $PWD/../.github/data/primer_schemes/nCoV-2019_Freed_1200bp.bed \
32-
--primer_pairs_tsv $PWD/../.github/data/primer_schemes/nCoV-2019_Freed_1200bp_primer_pairs.tsv \
33-
--gff $PWD/../.github/data/refs/MN908947.3.gff \
34-
--composite_ref $PWD/../.github/data/refs/mock_composite_ref/mock_composite_ref.fa \
35-
--illumina \
36-
--prefix test
29+
--cache ${HOME}/.conda/envs \
30+
--kraken2_db ${PWD}/../.github/data/kraken2_db \
31+
--bracken_db ${PWD}/../.github/data/kraken2_db \
32+
--run_dir ${PWD}/../.github/data/mock_runs/210101_M00000_0000_000000000-A1B2C \
33+
--outdir results
3734

3835
cp .nextflow.log ../artifacts/previous_release.nextflow.log
36+
cp -r results ../artifacts/previous_release_results
3937

40-
cd ..
38+
popd
4139

4240
# exclude files from comparison
4341
# and list differences
@@ -48,6 +46,7 @@ find results ./previous_release/results \
4846
-o -name "*.bam.bai" \
4947
-o -name "*.vcf" \
5048
| xargs rm -rf
49+
5150
if ! git diff --stat --no-index results ./previous_release/results > diffs.txt ; then
5251
echo "test failed: differences found between PR and previous release" >> artifacts/test_artifact.log
5352
echo "see diffs.txt" >> artifacts/test_artifact.log

0 commit comments

Comments
 (0)