Skip to content

Commit 722b9df

Browse files
Merge pull request #28 from EBI-Metagenomics/fix_seqprep
Fix seqprep
2 parents ebf8b99 + 2107195 commit 722b9df

File tree

11 files changed

+210
-77
lines changed

11 files changed

+210
-77
lines changed
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
FROM alpine:3.7

LABEL maintainer="Ekaterina Sakharova <[email protected]>"
##############################################################
# Dockerfile Version:   19.03.1
# Software:             seqtk + bash wrapper
# Software Version:     1.3 (r106)
# Description:          filter reads < LEN bp from fastq
#                       paired-end files
##############################################################

# bash runs the wrapper script, wget fetches the seqtk sources, and
# build-base + zlib-dev are the toolchain used to compile seqtk.
RUN apk add --no-cache bash wget gzip build-base zlib-dev

# install seqtk
ENV VERSION=1.3
# No WORKDIR is set, so the sources unpack into /seqtk-$VERSION, which is the
# directory added to PATH below. The archive is removed in the same layer so
# it does not stay in the final image.
RUN wget https://github.com/lh3/seqtk/archive/v$VERSION.zip && \
    unzip v$VERSION.zip && \
    rm v$VERSION.zip && \
    cd seqtk-$VERSION && make

# add wrapper
COPY filter_paired_reads.sh /tools/
RUN chmod a+x /tools/*

# Expose both the compiled seqtk binary and the wrapper script on PATH.
ENV PATH="/seqtk-$VERSION:/tools:${PATH}"

CMD ["filter_paired_reads.sh"]
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
#!/usr/bin/env cwl-runner
cwlVersion: v1.0
class: CommandLineTool

label: "remove reads from both files that are less than LEN"

# Wraps filter_paired_reads.sh: the script receives the two mate files and a
# length threshold (-f / -r / -l) and writes forward_filt.fastq and
# reverse_filt.fastq into the working directory, which are collected below.

requirements:
  - class: ResourceRequirement
    coresMax: 1
    ramMin: 200

hints:
  DockerRequirement:
    dockerPull: microbiomeinformatics/pipeline-v5.filter-paired

baseCommand: filter_paired_reads.sh

inputs:
  # Forward (first-mate) reads.
  forward:
    type: File
    format: edam:format_1930
    inputBinding:
      prefix: -f
  # Reverse (second-mate) reads.
  reverse:
    type: File
    format: edam:format_1930
    inputBinding:
      prefix: -r
  # Length threshold handed to the wrapper script as LEN.
  len:
    type: int
    inputBinding:
      prefix: -l

outputs:
  forward_filtered:
    type: File
    format: edam:format_1930
    outputBinding:
      glob: forward_filt.fastq
  reverse_filtered:
    type: File
    format: edam:format_1930
    outputBinding:
      glob: reverse_filt.fastq

$namespaces:
  edam: http://edamontology.org/
  s: http://schema.org/
$schemas:
  - http://edamontology.org/EDAM_1.16.owl
  - https://schema.org/version/latest/schemaorg-current-http.rdf

s:license: "https://www.apache.org/licenses/LICENSE-2.0"
s:copyrightHolder: "EMBL - European Bioinformatics Institute"
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
#!/bin/bash
#
# Keep only the read pairs in which BOTH mates are at least LEN bases long.
#
# Usage:
#   filter_paired_reads.sh -f FORWARD.fastq.gz -r REVERSE.fastq.gz -l LEN
#
# Inputs are gzip-compressed FASTQ files. All outputs are written to the
# current working directory:
#   forward.fastq, reverse.fastq   decompressed copies of the inputs
#   selected_1, selected_2         sorted names of reads with length >= LEN
#   common                         names present in both lists
#   forward_filt.fastq             forward reads whose name is in `common`
#   reverse_filt.fastq             reverse reads whose name is in `common`

# pipefail: a failing seqtk inside a pipeline must abort the script instead
# of silently producing an empty name list.
set -eo pipefail

usage() {
    echo "Usage: $(basename "$0") -f FORWARD.fastq.gz -r REVERSE.fastq.gz -l LEN" >&2
    exit 1
}

while getopts :f:r:l: option; do
    case "${option}" in
        f) FORWARD=${OPTARG};;
        r) REVERSE=${OPTARG};;
        l) LEN=${OPTARG};;
    esac
done

# Without this check a missing -f/-r would make gunzip read from stdin.
if [[ -z "${FORWARD:-}" || -z "${REVERSE:-}" || -z "${LEN:-}" ]]; then
    usage
fi

# A non-numeric LEN would make awk fall back to string comparison.
if ! [[ "${LEN}" =~ ^-?[0-9]+$ ]]; then
    echo "LEN must be an integer, got: ${LEN}" >&2
    exit 1
fi

# Print the sorted names of all reads in FASTQ file $1 that are >= LEN long.
# `seqtk comp` prints one tab-separated line per read: name, length, ...
long_read_names() {
    seqtk comp "$1" \
        | awk -F'\t' -v l="${LEN}" '$2 >= l { print $1 }' \
        | LC_ALL=C sort
}

gunzip -c "${FORWARD}" > forward.fastq
gunzip -c "${REVERSE}" > reverse.fastq

long_read_names forward.fastq > selected_1
long_read_names reverse.fastq > selected_2

# comm requires both inputs to be sorted with the same collation as the one
# it uses itself, hence LC_ALL=C here and in long_read_names.
LC_ALL=C comm -12 selected_1 selected_2 > common

# seqtk subseq emits the selected records in the order of the FASTQ file, so
# sorting the name list does not change the order of the output reads.
seqtk subseq forward.fastq common > forward_filt.fastq
seqtk subseq reverse.fastq common > reverse_filt.fastq
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# Example job inputs for the paired-read length filter tool.
$namespaces:
  edam: http://edamontology.org/

# First-mate reads of the example amplicon run.
forward:
  class: File
  path: ../../../input_examples/amplicon-paired-ERR2237853_1.fastq.gz
  format: edam:format_1930

# Second-mate reads of the same run.
reverse:
  class: File
  path: ../../../input_examples/amplicon-paired-ERR2237853_2.fastq.gz
  format: edam:format_1930

# Length threshold passed to the tool.
len: 100

tools/SeqPrep/seqprep.cwl

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ inputs:
2929
label: second read input fastq
3030
inputBinding:
3131
prefix: -r
32+
name: string
3233

3334
baseCommand: SeqPrep
3435

@@ -38,7 +39,7 @@ arguments:
3839
- "-2"
3940
- reverse_unmerged.fastq.gz
4041
- valueFrom: |
41-
${ return inputs.forward_reads.nameroot.split('_')[0] + '_MERGED.fastq.gz' }
42+
${ return inputs.name + '_MERGED.fastq.gz' }
4243
prefix: "-s"
4344
# - "-3"
4445
# - forward_discarded.fastq.gz

utils/multiple-gunzip.cwl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,11 @@
22
cwlVersion: v1.0
33

44
class: CommandLineTool
5-
label: "merges output of seqprep and unzips for paired end reads, or unzips file for single end"
5+
label: "unzip files"
66
requirements:
77
ResourceRequirement:
88
coresMin: 1
9-
ramMin: 200 # just a default, could be lowered
9+
ramMin: 2000 # just a default, could be lowered
1010
InlineJavascriptRequirement: {}
1111

1212
hints:

workflows/conditionals/amplicon/amplicon-1.cwl

Lines changed: 6 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -70,41 +70,29 @@ steps:
7070
out: [ hashsum ]
7171

7272

73-
# << SeqPrep only for paired reads >>
73+
# << SeqPrep (only for paired reads) + gunzip for both paired and single reads >>
7474
overlap_reads:
7575
label: Paired-end overlapping reads are merged
76-
run: ../../../tools/SeqPrep/seqprep.cwl
77-
when: $(inputs.single == undefined)
76+
run: ../../subworkflows/seqprep-subwf.cwl
7877
in:
7978
single: single_reads
8079
forward_reads: forward_reads
8180
reverse_reads: reverse_reads
82-
out: [ merged_reads, forward_unmerged_reads, reverse_unmerged_reads ]
83-
84-
# << unzipping only >>
85-
unzip_reads:
86-
run: ../../../utils/multiple-gunzip.cwl
87-
in:
88-
target_reads:
89-
source:
90-
- overlap_reads/merged_reads
91-
- single_reads
92-
pickValue: first_non_null
93-
reads: { default: true }
94-
out: [ unzipped_merged_reads ]
81+
paired_reads_length_filter: { default: 100 }
82+
out: [ unzipped_single_reads ]
9583

9684
count_submitted_reads:
9785
run: ../../../utils/count_lines/count_lines.cwl
9886
in:
99-
sequences: unzip_reads/unzipped_merged_reads
87+
sequences: overlap_reads/unzipped_single_reads
10088
number: { default: 4 }
10189
out: [ count ]
10290

10391
# << Trim and Reformat >>
10492
trimming:
10593
run: ../../subworkflows/trim_and_reformat_reads.cwl
10694
in:
107-
reads: unzip_reads/unzipped_merged_reads
95+
reads: overlap_reads/unzipped_single_reads
10896
count: count_submitted_reads/count
10997
out: [ trimmed_and_reformatted_reads ]
11098

workflows/conditionals/raw-reads/raw-reads-1.cwl

Lines changed: 6 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -72,33 +72,21 @@ steps:
7272
out: [ hashsum ]
7373

7474

75-
# << SeqPrep only for paired reads >>
75+
# << SeqPrep (only for paired reads) + gunzip for both paired and single reads >>
7676
overlap_reads:
7777
label: Paired-end overlapping reads are merged
78-
run: ../../../tools/SeqPrep/seqprep.cwl
79-
when: $(inputs.single == undefined)
78+
run: ../../subworkflows/seqprep-subwf.cwl
8079
in:
8180
single: single_reads
8281
forward_reads: forward_reads
8382
reverse_reads: reverse_reads
84-
out: [ merged_reads, forward_unmerged_reads, reverse_unmerged_reads ]
85-
86-
# << unzipping only >>
87-
unzip_reads:
88-
run: ../../../utils/multiple-gunzip.cwl
89-
in:
90-
target_reads:
91-
source:
92-
- overlap_reads/merged_reads
93-
- single_reads
94-
pickValue: first_non_null
95-
reads: { default: true }
96-
out: [ unzipped_merged_reads ]
83+
paired_reads_length_filter: { default: 100 }
84+
out: [ unzipped_single_reads ]
9785

9886
count_submitted_reads:
9987
run: ../../../utils/count_lines/count_lines.cwl
10088
in:
101-
sequences: unzip_reads/unzipped_merged_reads
89+
sequences: overlap_reads/unzipped_single_reads
10290
number: { default: 4 }
10391
out: [ count ]
10492

@@ -109,7 +97,7 @@ steps:
10997
less than 15 over a 4 nucleotide wide window are removed)
11098
run: ../../../tools/Trimmomatic/Trimmomatic-v0.36-SE.cwl
11199
in:
112-
reads1: unzip_reads/unzipped_merged_reads
100+
reads1: overlap_reads/unzipped_single_reads
113101
phred: { default: '33' }
114102
leading: { default: 3 }
115103
trailing: { default: 3 }

workflows/subworkflows/chunking-subwf-hmmer.yml

Lines changed: 0 additions & 11 deletions
This file was deleted.

workflows/subworkflows/classify-otu-visualise.yml

Lines changed: 0 additions & 27 deletions
This file was deleted.

0 commit comments

Comments
 (0)