Skip to content

Commit 593754c

Browse files
authored
Merge pull request #137 from sunbeam-labs/dev
Low-complexity read removal and misc bug fixes
2 parents 3624a25 + e8ec075 commit 593754c

File tree

10 files changed

+106
-25
lines changed

10 files changed

+106
-25
lines changed

Readme.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ Sunbeam is a pipeline written in [snakemake](http://snakemake.readthedocs.io)
99
that simplifies and automates many of the steps in metagenomic sequencing
1010
analysis. It uses [conda](http://conda.io) to manage dependencies, so it
1111
doesn't require pre-existing dependencies or admin privileges, and can be deployed
12-
on most Linux and Mac workstations and clusters.
12+
on most Linux workstations and clusters.
1313

1414
Sunbeam currently automates the following tasks:
1515

Snakefile

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
# Author: Erik Clarke <ecl@mail.med.upenn.edu>
55
# Created: 2016-04-28
66
#
7-
7+
import os
88
import re
99
import sys
1010
import yaml
@@ -26,6 +26,14 @@ if not config:
2626
"No config file specified. Run `sunbeam init` to generate a "
2727
"config file, and specify with --configfile")
2828

29+
sunbeam_dir = ""
30+
try:
31+
sunbeam_dir = os.environ["SUNBEAM_DIR"]
32+
except KeyError:
33+
raise SystemExit(
34+
"$SUNBEAM_DIR environment variable not defined. Are you sure you're "
35+
"running this from the Sunbeam conda env?")
36+
2937
# Check for major version compatibility
3038
pkg_major, cfg_major = check_compatibility(config)
3139
if pkg_major > cfg_major:
@@ -40,7 +48,7 @@ elif pkg_major < cfg_major:
4048
"`sunbeam init` and update it using `sunbeam_mod_config`\n")
4149

4250
# Load extensions
43-
sbxs = list(listfiles("extensions/{sbx_folder}/{sbx}.rules"))
51+
sbxs = list(listfiles(sunbeam_dir+"/extensions/{sbx_folder}/{sbx}.rules"))
4452
for sbx in sbxs:
4553
sys.stderr.write("Found extension {sbx} in folder {sbx_folder}\n".format(**sbx[1]))
4654

docs/index.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ Sunbeam is a pipeline written in `snakemake <http://snakemake.readthedocs.io>`_
1818
that simplifies and automates many of the steps in metagenomic sequencing
1919
analysis. It uses `conda <http://conda.io>`_ to manage dependencies, so it
2020
doesn't require pre-existing dependencies or admin privileges, and can be deployed
21-
on most Linux and Mac workstations and clusters.
21+
on most Linux workstations and clusters.
2222

2323
Sunbeam currently automates the following tasks:
2424

docs/quickstart.rst

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,11 +10,12 @@ Quickstart Guide
1010
Installation
1111
************
1212

13-
Download a copy of Sunbeam from our GitHub repository, and install.
13+
On a Linux machine, download a copy of Sunbeam from our GitHub repository, and
14+
install. We do not currently support non-Linux environments.
1415

1516
.. code-block:: shell
1617
17-
git clone -b stable https://github.com/eclarke/sunbeam sunbeam-stable
18+
git clone -b stable https://github.com/sunbeam-labs/sunbeam sunbeam-stable
1819
cd sunbeam-stable
1920
./install.sh
2021
tests/run_tests.bash -e sunbeam
@@ -31,7 +32,7 @@ runs some tests to make sure everything was installed correctly.
3132
PATH=$PATH:$HOME/miniconda3/bin` >> ~/.bashrc``
3233

3334
If you see "Tests failed", check out our :ref:`troubleshooting` section or file an issue
34-
on our `GitHub <https://github.com/eclarke/sunbeam/issues>`_ page.
35+
on our `GitHub <https://github.com/sunbeam-labs/sunbeam/issues>`_ page.
3536

3637
Setup
3738
*****

docs/usage.rst

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,15 @@ User Guide
77
.. contents::
88
:depth: 3
99

10+
Requirements
11+
============
12+
13+
- A relatively recent Linux computer with more than 2 GB of RAM
14+
15+
We do not currently support Windows or Mac. (You may be able to run this on
16+
Windows using the `WSL <https://docs.microsoft.com/en-us/windows/wsl/about>`_, but
17+
it has not been tested.)
18+
1019
.. _installation:
1120
Installation
1221
============

rules/assembly/assembly.rules

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,14 @@
22
#
33
# Contig building and other assembly rules
44
#
5-
# Requires Megahit and CAP3.
5+
# Requires Megahit.
66

77
rule all_assembly:
88
"""Build contigs for all samples."""
99
input:
1010
TARGET_ASSEMBLY
1111

12-
ruleorder: megahit_unpaired > megahit_paired
12+
ruleorder: megahit_paired > megahit_unpaired
1313

1414
rule megahit_paired:
1515
input:

rules/qc/qc.rules

Lines changed: 26 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ rule all_qc:
77
input:
88
TARGET_QC
99

10-
ruleorder: adapter_removal_unpaired > adapter_removal_paired
10+
ruleorder: adapter_removal_paired > adapter_removal_unpaired
1111

1212
rule adapter_removal_unpaired:
1313
input:
@@ -79,7 +79,7 @@ rule adapter_removal_paired:
7979
ln -s {input.r1} {output.gr1} && ln -s {input.r2} {output.gr2}
8080
""")
8181

82-
ruleorder: trimmomatic_unpaired > trimmomatic_paired
82+
ruleorder: trimmomatic_paired > trimmomatic_unpaired
8383

8484
rule trimmomatic_unpaired:
8585
input:
@@ -149,21 +149,34 @@ rule fastqc:
149149
outdir = str(QC_FP/'reports')
150150
shell:
151151
"fastqc -o {params.outdir} {input.reads} -extract"
152-
152+
153+
rule find_low_complexity:
154+
input:
155+
expand(
156+
str(QC_FP/'02_trimmomatic'/'{{sample}}_{rp}.fastq.gz'),
157+
rp=Pairs)
158+
output:
159+
str(QC_FP/'log'/'komplexity'/'{sample}.filtered_ids')
160+
shell:
161+
"""
162+
for rp in {input}; do
163+
gzip -dc $rp | kz | \
164+
awk '{{ if ($4<{Cfg[qc][kz_threshold]}) print $1 }}' >> {output}
165+
done
166+
"""
167+
153168
rule remove_low_complexity:
154169
input:
155-
str(QC_FP/'02_trimmomatic'/'{sample}_{rp}.fastq.gz')
170+
reads = str(QC_FP/'02_trimmomatic'/'{sample}_{rp}.fastq.gz'),
171+
ids = str(QC_FP/'log'/'komplexity'/'{sample}.filtered_ids')
156172
output:
157173
str(QC_FP/'03_komplexity'/'{sample}_{rp}.fastq.gz')
158-
run:
159-
if Cfg['qc']['mask_low_complexity']:
160-
shell("""
161-
kz --mask -t {Cfg[qc][kz_threshold]} -w {Cfg[qc][kz_window]} \
162-
< <(gzip -cd {input}) | gzip -c > {output}
163-
""")
164-
else:
165-
shell("ln -s {input} {output}")
166-
174+
shell:
175+
"""
176+
gzip -dc {input.reads} | rbt fastq-filter {input.ids} |\
177+
gzip > {output}
178+
"""
179+
167180
rule qc_final:
168181
input:
169182
str(QC_FP/'03_komplexity'/'{sample}_{rp}.fastq.gz')

sunbeamlib/data/default_config.yml

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,9 +39,7 @@ qc:
3939
fwd_adapters: ['GTTTCCCAGTCACGATC', 'GTTTCCCAGTCACGATCNNNNNNNNNGTTTCCCAGTCACGATC']
4040
rev_adapters: ['GTTTCCCAGTCACGATC', 'GTTTCCCAGTCACGATCNNNNNNNNNGTTTCCCAGTCACGATC']
4141
# Komplexity
42-
mask_low_complexity: true
4342
kz_threshold: 0.55
44-
kz_window: 32
4543
# Decontam.py
4644
pct_id: 0.5
4745
frac: 0.6

tests/targets_singleend.txt

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
annotation/blastn/bacteria/contig/dummybfragilis.xml
2+
annotation/blastn/bacteria/contig/dummyecoli.xml
3+
annotation/blastn/bacteria/contig/random.xml
4+
5+
annotation/summary/dummybfragilis.tsv
6+
annotation/summary/dummyecoli.tsv
7+
annotation/summary/random.tsv
8+
9+
assembly/contigs/dummybfragilis-contigs.fa
10+
assembly/contigs/dummyecoli-contigs.fa
11+
assembly/contigs/random-contigs.fa
12+
13+
classify/kraken/all_samples.biom
14+
classify/kraken/all_samples.tsv
15+
classify/kraken/dummybfragilis-taxa.tsv
16+
classify/kraken/dummyecoli-taxa.tsv
17+
classify/kraken/random-taxa.tsv
18+
19+
qc/decontam/dummybfragilis_1.fastq.gz
20+
qc/decontam/dummyecoli_1.fastq.gz
21+
qc/decontam/random_1.fastq.gz
22+
23+
qc/cleaned/dummybfragilis_1.fastq.gz
24+
qc/cleaned/dummyecoli_1.fastq.gz
25+
qc/cleaned/random_1.fastq.gz
26+
27+
mapping/human/coverage.csv
28+
mapping/phix174/coverage.csv

tests/test_suite.bash

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,31 @@ function test_version_check {
4343
fi
4444
}
4545

46-
# Test that we can
46+
# Test that we detect and run extensions
4747
function test_extensions {
4848
sunbeam run --configfile $TEMPDIR/tmp_config.yml sbx_test | grep "SBX_TEST"
4949
}
50+
51+
# Test that single-end sequencing configurations work
52+
function test_single_end {
53+
rm -rf $TEMPDIR/sunbeam_output/qc
54+
sunbeam config modify --str 'all: {paired_end: false}' \
55+
$TEMPDIR/tmp_config.yml > $TEMPDIR/single_end_config.yml
56+
sunbeam run --configfile $TEMPDIR/single_end_config.yml
57+
python tests/find_targets.py --prefix $TEMPDIR/sunbeam_output tests/targets_singleend.txt
58+
}
59+
60+
# Fix for #131
61+
# Test that paired-end qc rules produce files with the same number of reads
62+
function test_pair_concordance {
63+
rm -rf $TEMPDIR/sunbeam_output/qc
64+
sunbeam run --configfile $TEMPDIR/tmp_config.yml all_decontam
65+
for r1 in $TEMPDIR/sunbeam_output/qc/cleaned/*_1.fastq.gz; do
66+
r1_lines=$(zcat $r1 | wc -l)
67+
r2=${r1%_1.fastq.gz}_2.fastq.gz
68+
r2_lines=$(zcat $r2 | wc -l)
69+
if [ $r1_lines -ne $r2_lines ]; then
70+
exit 1
71+
fi
72+
done
73+
}

0 commit comments

Comments
 (0)