Skip to content

Commit 8b9be0c

Browse files
committed
add cond to suppress wf and remove empty.sh
1 parent 124afdc commit 8b9be0c

File tree

31 files changed

+183743
-160
lines changed

31 files changed

+183743
-160
lines changed

.travis.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ services:
88

99
env:
1010
- TOOL="Validate" CONF_SCRIPT="travis/conformance-tests.sh"
11+
# - TOOL="SUPPRESS" TEST_SUPPRESS_SUBWF="travis/cwltest.sh"
1112

1213
install:
1314
- travis/install-nodejs.sh
@@ -19,3 +20,4 @@ before_script:
1920

2021
script:
2122
- bash ${CONF_SCRIPT}
23+
# - bash ${TEST_SUPPRESS_SUBWF} travis/tests/amplicon-suppress/1.yml

input_examples/amplicon/ERR632171_FASTQ.fasta

Lines changed: 183246 additions & 0 deletions
Large diffs are not rendered by default.

tools/Assembly/antismash/chunking_antismash_with_conditionals/antismash_chunking_subwf.cwl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -135,14 +135,14 @@ steps:
135135

136136
# gzip embl
137137
gzipped_embl:
138-
run: ../../../../utils/gzip.cwl
138+
run: ../../../../utils/pigz/gzip.cwl
139139
in:
140140
uncompressed_file: unite_embl/result
141141
out: [ compressed_file ]
142142

143143
# gzip gbk
144144
gzipped_gbk:
145-
run: ../../../../utils/gzip.cwl
145+
run: ../../../../utils/pigz/gzip.cwl
146146
in:
147147
uncompressed_file: unite_gbk/result
148148
out: [ compressed_file ]

tools/RNA_prediction/biom-convert/biom-convert.cwl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ baseCommand: [ "biom-convert.sh" ]
2424

2525
inputs:
2626
biom:
27-
type: File
27+
type: File?
2828
format: edam:format_3746 # BIOM
2929
inputBinding:
3030
prefix: --input-fp
Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
1+
#!/usr/bin/python3
2+
import glob
3+
import argparse
4+
import sys
5+
import os
6+
from Bio import SeqIO
7+
import gzip
8+
import shutil
9+
10+
11+
def get_avg_length(masked_its):
    """Return the average length of the longest unmasked ITS stretch per record.

    Every record of the gzipped FASTA file ``masked_its`` is split on the
    character ``'N'``; the longest resulting piece is kept for that record,
    and the mean of those per-record lengths is returned truncated to ``int``.

    Returns 0 when ``masked_its`` is ``None``, when the path does not exist,
    or when the file contains no records (the last case previously raised
    ``ZeroDivisionError``).
    """
    if masked_its is None or not os.path.exists(masked_its):
        return 0

    longest_lengths = []
    with gzip.open(masked_its, 'rt') as unzipped_file:
        for record in SeqIO.parse(unzipped_file, 'fasta'):
            # Empty pieces have length 0, so they never win the max().
            pieces = record.seq.split('N')
            longest_lengths.append(max((len(piece) for piece in pieces), default=0))

    if not longest_lengths:
        # No records at all: there is nothing to average.
        return 0
    return int(sum(longest_lengths) / len(longest_lengths))
30+
31+
32+
def hits_to_num_ratio(fasta, input_folder):
    """Return the ratio of summed TSV hit counts to the number of sequences.

    The hit total is the sum of the second tab-separated column over all
    non-comment lines of a ``*.tsv`` file found in ``input_folder``. The
    sequence count is the number of ``>`` header lines in the gzipped FASTA
    file ``fasta``.

    Returns 0 when the relative path of ``fasta`` contains ``'empty'`` or
    when the FASTA holds no sequences.

    Raises:
        FileNotFoundError: if ``input_folder`` contains no ``*.tsv`` file.
    """
    if 'empty' in os.path.relpath(fasta):
        return 0

    # Sort so the choice is deterministic if more than one TSV is present.
    tsv_paths = sorted(glob.glob(os.path.join(input_folder, '*.tsv')))
    if not tsv_paths:
        raise FileNotFoundError('no *.tsv file found in ' + str(input_folder))

    rna_sum = 0.0
    with open(tsv_paths[0], 'r') as rna_hits:
        for line in rna_hits:
            # Skip header/comment lines and blank lines (no second column).
            if line.startswith('#') or not line.strip():
                continue
            rna_sum += float(line.split('\t')[1])

    # Use a context manager so the gzip handle is closed after counting.
    with gzip.open(fasta, 'rt') as sequences:
        rna_num = sum(1 for line in sequences if line.startswith('>'))

    if rna_num == 0:
        # No sequences: avoid dividing by zero and report "no hits".
        return 0
    return rna_sum / rna_num
44+
45+
46+
def validate_hits(ssu_fasta, lsu_fasta, ssu_folder, lsu_folder, len_avg):
    """Choose which results to keep from the ITS length and SSU/LSU hit ratios.

    A hit ratio is computed with ``hits_to_num_ratio`` for each of SSU and
    LSU whose folder is not ``None``; a missing folder counts as ratio 0.

    Returns one of the tags:
        'both' - ``len_avg`` > 200 and a ratio exceeds 0.1
        'ITS'  - ``len_avg`` > 200 or in 120..199, and no ratio exceeds 0.1
        'rRNA' - ``len_avg`` in 120..199 and a ratio exceeds 0.1, or
                 ``len_avg`` outside both ranges
    """
    ssu_ratio = hits_to_num_ratio(ssu_fasta, ssu_folder) if ssu_folder is not None else 0
    lsu_ratio = hits_to_num_ratio(lsu_fasta, lsu_folder) if lsu_folder is not None else 0

    # Compare each ratio with the threshold separately: `a or b > 0.1`
    # parses as `a or (b > 0.1)` and would accept any non-zero SSU ratio.
    rrna_supported = ssu_ratio > 0.1 or lsu_ratio > 0.1

    if len_avg > 200:
        return 'both' if rrna_supported else 'ITS'
    if 120 <= len_avg <= 199:
        return 'rRNA' if rrna_supported else 'ITS'
    # NOTE(review): a length of exactly 200 also lands here, as in the
    # original thresholds - confirm whether 200 belongs to the middle band.
    return 'rRNA'
61+
62+
63+
def suppress_dir(flag, lsu, ssu, its, its_file, ssu_file, lsu_file):
    """Copy result folders and FASTA files to their kept or suppressed place.

    Creates the directories ``suppressed`` and ``taxonomy-summary`` in the
    current working directory. Then, for each of LSU, SSU and its whose
    folder argument is not ``None``:

    * kept results: the folder is copied to ``taxonomy-summary/<NAME>`` and
      the FASTA file to the working directory under its own base name;
    * suppressed results: the folder is copied to ``suppressed/<NAME>`` and
      the FASTA file into ``suppressed``.

    ``flag`` decides what is kept: ``'ITS'`` keeps only its, ``'rRNA'`` keeps
    only LSU and SSU, ``'both'`` keeps everything. Any other flag copies
    nothing. A folder given without its FASTA file (file argument ``None``)
    is copied without a file instead of failing.
    """
    suppressed_folder = 'suppressed'
    taxonomy_summary = 'taxonomy-summary'
    os.mkdir(suppressed_folder)
    os.mkdir(taxonomy_summary)

    list_folders = []
    list_files = []
    for folder, name, cur_file in ((lsu, '/LSU', lsu_file),
                                   (ssu, '/SSU', ssu_file),
                                   (its, '/its', its_file)):
        if folder is None:
            continue
        if flag == 'both':
            keep = True
        elif flag == 'ITS':
            keep = name == '/its'
        elif flag == 'rRNA':
            keep = name != '/its'
        else:
            # Unrecognised flag: nothing is scheduled for copying.
            continue

        target_root = taxonomy_summary if keep else suppressed_folder
        list_folders.append((folder, target_root + name))
        # The file arguments are optional independently of the folders, so
        # only schedule a copy when a file was actually supplied.
        if cur_file is not None:
            file_dest = os.path.basename(cur_file) if keep else suppressed_folder
            list_files.append((cur_file, file_dest))

    # Folders first, then files - same order as before.
    for src, dest in list_folders:
        shutil.copytree(src, dest)
    for src, dest in list_files:
        shutil.copy(src, dest)
102+
103+
104+
if __name__ == '__main__':
    # Command-line entry point: report the average ITS length, pick the
    # suppression tag, copy the folders/files accordingly and finally drop
    # whichever of the two output directories stayed empty.
    parser = argparse.ArgumentParser(description="get average length of ITS sequences and suppress unwanted folders")

    # (option, dest, help) - registered in this order so --help is unchanged.
    for option, dest, text in (
            ("--lsu-file", "lsu_file", "lsu fasta"),
            ("--ssu-file", "ssu_file", "ssu fasta"),
            ("--its-file", "its_file", "its fasta"),
            ("--lsu-dir", "lsu_directory", "directory in path taxonomy-summary/LSU"),
            ("--ssu-dir", "ssu_directory", "directory in path taxonomy-summary/SSU"),
            ("--its-dir", "its_directory", "directory in path taxonomy-summary/its"),
    ):
        parser.add_argument(option, dest=dest, help=text, required=False, default=None)

    if len(sys.argv) >= 3:
        args = parser.parse_args()
        avg = get_avg_length(args.its_file)
        print('average ITS length is ' + str(avg))
        print('suppressing...')
        suppress_flag = validate_hits(args.ssu_file, args.lsu_file,
                                      args.ssu_directory, args.lsu_directory, avg)
        print(suppress_flag)
        suppress_dir(suppress_flag,
                     args.lsu_directory, args.ssu_directory, args.its_directory,
                     args.its_file, args.ssu_file, args.lsu_file)
        # Remove an output directory that received nothing.
        for out_dir in ('suppressed', 'taxonomy-summary'):
            if not os.listdir(out_dir):
                os.rmdir(out_dir)
    else:
        # Fewer than one option/value pair on the command line: show usage.
        parser.print_help()

tools/mask-for-ITS/its-length.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
#!/usr/bin/env /hps/nobackup2/production/metagenomics/pipeline/tools-v5/miniconda3-4.6.14/bin/python3
1+
#!/usr/bin/python3
22
import glob
33
import argparse
44
import sys
@@ -25,7 +25,7 @@ def get_avg_length(masked_its): # get average length of longest ITS sequences -
2525
else:
2626
return 0
2727

28-
def hits_to_num_ratio(fasta, input_folder): # ratio of mapseq hits to number of total seqs LSU/SSU
28+
def hits_to_num_ratio(fasta, input_folder): # ratio of mapseq hits to number of total seqs LSU/SSU
2929
rna_sum, rna_num = [0 for _ in range(2)]
3030
rna = os.path.join(input_folder, '*.tsv')
3131
if 'empty' not in os.path.relpath(fasta):
@@ -93,5 +93,4 @@ def suppress_dir(flag, lsu, ssu, its, its_file, ssu_file, lsu_file):
9393
validate_hits(args.ssu_file, args.lsu_file, args.ssu_directory, args.lsu_directory, avg)
9494
print('suppressing...')
9595
suppress_flag = validate_hits(args.ssu_file, args.lsu_file, args.ssu_directory, args.lsu_directory, avg)
96-
suppress_dir(suppress_flag, args.lsu_directory, args.ssu_directory, args.its_directory, args.its_file, args.ssu_file, args.lsu_file)
97-
96+
suppress_dir(suppress_flag, args.lsu_directory, args.ssu_directory, args.its_directory, args.its_file, args.ssu_file, args.lsu_file)

tools/mask-for-ITS/suppress_tax.cwl

Lines changed: 15 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -11,51 +11,49 @@ requirements:
1111

1212
inputs:
1313
ssu_file:
14-
type: File
14+
type: File?
1515
inputBinding:
1616
prefix: --ssu-file
1717
lsu_file:
18-
type: File
18+
type: File?
1919
inputBinding:
2020
prefix: --lsu-file
2121
its_file:
22-
type: File
22+
type: File?
2323
inputBinding:
2424
prefix: --its-file
2525
lsu_dir:
26-
type: Directory
27-
default: "LSU"
26+
type: Directory?
27+
# default: "LSU"
2828
inputBinding:
2929
prefix: --lsu-dir
3030
ssu_dir:
31-
type: Directory
32-
default: "SSU"
31+
type: Directory?
32+
# default: "SSU"
3333
inputBinding:
3434
prefix: --ssu-dir
3535
its_dir:
36-
type: Directory
37-
default: "its"
36+
type: Directory?
37+
# default: "its"
3838
inputBinding:
3939
prefix: --its-dir
4040

4141

42-
baseCommand: [its-length.py]
42+
baseCommand: [ its-length-new.py ]
4343
stdout: ITS_LENGTH
4444

4545
outputs:
46-
stdout: stdout
46+
its_length: stdout
4747
out_tax:
48-
type: Directory
48+
type: Directory?
4949
outputBinding:
5050
glob: "taxonomy-summary"
5151
out_suppress:
52-
type: Directory
52+
type: Directory?
5353
outputBinding:
5454
glob: "suppressed"
55-
out_fastas:
56-
type:
57-
type: array
58-
items: File
55+
out_fastas_tax:
56+
type: File[]?
5957
outputBinding:
6058
glob: "*.fasta.gz"
6159

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,19 @@
11
ssu_file:
22
class: File
3-
path: SSU.fasta.gz
3+
path: test-input/SSU/SSU.fasta.gz
44
lsu_file:
55
class: File
6-
path: LSU.fasta.gz
6+
path: test-input/LSU/LSU.fasta.gz
77
its_file:
88
class: File
9-
path: ITS_masked.fasta.gz
9+
path: test-input/its/its.fasta.gz
1010
lsu_dir:
1111
class: Directory
12-
path: taxonomy-summary/LSU
12+
path: test-input/LSU
1313
ssu_dir:
1414
class: Directory
15-
path: taxonomy-summary/SSU
15+
path: test-input/SSU
1616
its_dir:
1717
class: Directory
18-
path: taxonomy-summary/its
18+
path: test-input/its
19+
675 Bytes
Binary file not shown.
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
# Constructed from biom file
2+
# OTU ID LSU_rRNA taxonomy taxid
3+
25676 3.0 sk__Eukaryota 2759
4+
16424 1.0 sk__Eukaryota;k__Fungi 4751
5+
20472 2.0 sk__Eukaryota;k__Fungi;p__Ascomycota;c__Leotiomycetes 147548
6+
1400 1.0 sk__Eukaryota;k__Fungi;p__Ascomycota;c__Sordariomycetes 147550
7+
4544 2.0 sk__Eukaryota;k__Fungi;p__Ascomycota;c__Sordariomycetes;o__Hypocreales 5125
8+
15497 6.0 sk__Eukaryota;k__Fungi;p__Basidiomycota;c__Agaricomycetes 155619
9+
2195 1.0 sk__Eukaryota;k__Fungi;p__Mucoromycota;c__Glomeromycetes 214506
10+
14320 1.0 sk__Eukaryota;k__Fungi;p__Mucoromycota;c__Glomeromycetes;o__Glomerales;f__Glomeraceae;g__Rhizophagus 1129544
11+
19603 1.0 sk__Eukaryota;k__Fungi;p__Mucoromycota;c__Glomeromycetes;o__Glomerales;f__Glomeraceae;g__Rhizophagus;s__Rhizophagus_intraradices 4876
12+
17712 3.0 sk__Eukaryota;k__Viridiplantae;p__Streptophyta;c__Magnoliopsida 3398

0 commit comments

Comments
 (0)