Skip to content

Commit 8f6aa5e

Browse files
Added gzip support for alignment file.
1 parent 1b8f510 commit 8f6aa5e

File tree

7 files changed

+126
-117
lines changed

7 files changed

+126
-117
lines changed

CAT_pack/about.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
#!/usr/bin/env python3
22

33
__author__ = 'F. A. Bastiaan von Meijenfeldt'
4-
__version__ = '5.1'
5-
__date__ = '22 June, 2020'
4+
__version__ = '5.1.1'
5+
__date__ = '23 June, 2020'

CAT_pack/bins.py

Lines changed: 9 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -224,6 +224,12 @@ def parse_arguments():
224224
action=shared.PathAction,
225225
help=('Directory for temporary DIAMOND files (default: directory '
226226
'to which output files are written).'))
227+
specific.add_argument(
228+
'--compress',
229+
dest='compress',
230+
required=False,
231+
action='store_true',
232+
help='Compress DIAMOND alignment file.')
227233
specific.add_argument(
228234
'--top',
229235
dest='top',
@@ -398,7 +404,7 @@ def run():
398404
args.bin_folder,
399405
args.taxonomy_folder,
400406
args.database_folder,
401-
float(args.r),
407+
int(args.r),
402408
float(args.f),
403409
args.log_file))
404410
shared.give_user_feedback(message, args.log_file, args.quiet,
@@ -537,21 +543,8 @@ def run():
537543
contig_names, contig2ORFs, args.log_file, args.quiet)
538544

539545
if 'align' in step_list:
540-
shared.run_diamond(
541-
args.path_to_diamond,
542-
args.diamond_database,
543-
args.proteins_fasta,
544-
args.alignment_file,
545-
args.nproc,
546-
args.sensitive,
547-
args.block_size,
548-
args.index_chunks,
549-
args.tmpdir,
550-
args.top,
551-
args.log_file,
552-
args.quiet,
553-
args.verbose)
554-
546+
shared.run_diamond(args)
547+
555548
(ORF2hits,
556549
all_hits) = shared.parse_tabular_alignment(
557550
args.alignment_file,

CAT_pack/contigs.py

Lines changed: 9 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -213,6 +213,12 @@ def parse_arguments():
213213
action=shared.PathAction,
214214
help=('Directory for temporary DIAMOND files (default: directory '
215215
'to which output files are written).'))
216+
specific.add_argument(
217+
'--compress',
218+
dest='compress',
219+
required=False,
220+
action='store_true',
221+
help='Compress DIAMOND alignment file.')
216222
specific.add_argument(
217223
'--top',
218224
dest='top',
@@ -311,7 +317,7 @@ def run():
311317
args.contigs_fasta,
312318
args.taxonomy_folder,
313319
args.database_folder,
314-
float(args.r),
320+
int(args.r),
315321
float(args.f),
316322
args.log_file))
317323
shared.give_user_feedback(message, args.log_file, args.quiet,
@@ -431,21 +437,8 @@ def run():
431437
contig_names, contig2ORFs, args.log_file, args.quiet)
432438

433439
if 'align' in step_list:
434-
shared.run_diamond(
435-
args.path_to_diamond,
436-
args.diamond_database,
437-
args.proteins_fasta,
438-
args.alignment_file,
439-
args.nproc,
440-
args.sensitive,
441-
args.block_size,
442-
args.index_chunks,
443-
args.tmpdir,
444-
args.top,
445-
args.log_file,
446-
args.quiet,
447-
args.verbose)
448-
440+
shared.run_diamond(args)
441+
449442
(ORF2hits,
450443
all_hits) = shared.parse_tabular_alignment(
451444
args.alignment_file,

CAT_pack/shared.py

Lines changed: 79 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import argparse
44
import datetime
55
import decimal
6+
import gzip
67
import os
78
import subprocess
89
import sys
@@ -216,24 +217,16 @@ def run_prodigal(
216217
return
217218

218219

219-
def run_diamond(
220-
path_to_diamond,
221-
diamond_database,
222-
proteins_fasta,
223-
alignment_file,
224-
nproc,
225-
sensitive,
226-
block_size,
227-
index_chunks,
228-
tmpdir,
229-
top,
230-
log_file,
231-
quiet,
232-
verbose):
233-
if not sensitive:
220+
def run_diamond(args):
221+
if args.sensitive:
222+
mode = 'sensitive'
223+
else:
234224
mode = 'fast'
225+
226+
if args.compress:
227+
compression = '1'
235228
else:
236-
mode = 'sensitive'
229+
compression = '0'
237230

238231
message = (
239232
'Homology search with DIAMOND is starting. Please be patient. Do '
@@ -246,47 +239,54 @@ def run_diamond(
246239
'\t\t\t\tblock-size (billions of letters): {4}\n'
247240
'\t\t\t\tindex-chunks: {5}\n'
248241
'\t\t\t\ttmpdir: {6}\n'
249-
'\t\t\t\ttop: {7}'.format(
250-
proteins_fasta,
251-
diamond_database,
242+
'\t\t\t\tcompress: {7}\n'
243+
'\t\t\t\ttop: {8}'.format(
244+
args.proteins_fasta,
245+
args.diamond_database,
252246
mode,
253-
nproc,
254-
block_size,
255-
index_chunks,
256-
tmpdir,
257-
top))
258-
give_user_feedback(message, log_file, quiet)
247+
args.nproc,
248+
args.block_size,
249+
args.index_chunks,
250+
args.tmpdir,
251+
compression,
252+
args.top))
253+
give_user_feedback(message, args.log_file, args.quiet)
259254

260255
try:
261256
command = [
262-
path_to_diamond,
257+
args.path_to_diamond,
263258
'blastp',
264-
'-d', diamond_database,
265-
'-q', proteins_fasta,
266-
'--top', str(top),
259+
'-d', args.diamond_database,
260+
'-q', args.proteins_fasta,
261+
'--top', str(args.top),
267262
'--matrix', 'BLOSUM62',
268263
'--evalue', '0.001',
269-
'-o', alignment_file,
270-
'-p', str(nproc),
271-
'--block-size', str(block_size),
272-
'--index-chunks', str(index_chunks),
273-
'--tmpdir', tmpdir]
274-
275-
if not verbose:
264+
'-o', args.alignment_file,
265+
'-p', str(args.nproc),
266+
'--block-size', str(args.block_size),
267+
'--index-chunks', str(args.index_chunks),
268+
'--tmpdir', args.tmpdir,
269+
'--compress', compression]
270+
271+
if not args.verbose:
276272
command += ['--quiet']
277273

278-
if sensitive:
274+
if args.sensitive:
279275
command += ['--sensitive']
280276

281277
subprocess.check_call(command)
282278
except:
283279
message = 'DIAMOND finished abnormally.'
284-
give_user_feedback(message, log_file, quiet, error=True)
280+
give_user_feedback(message, args.log_file, args.quiet, error=True)
285281

286282
sys.exit(1)
287283

288-
message = 'Homology search done! File {0} created.'.format(alignment_file)
289-
give_user_feedback(message, log_file, quiet)
284+
if args.compress:
285+
setattr(args, 'alignment_file', '{0}.gz'.format(args.alignment_file))
286+
287+
message = 'Homology search done! File {0} created.'.format(
288+
args.alignment_file)
289+
give_user_feedback(message, args.log_file, args.quiet)
290290

291291
return
292292

@@ -343,38 +343,50 @@ def parse_tabular_alignment(
343343
message = 'Parsing alignment file {0}.'.format(alignment_file)
344344
give_user_feedback(message, log_file, quiet)
345345

346+
compressed = False
347+
if alignment_file.endswith('.gz'):
348+
compressed = True
349+
350+
f1 = gzip.open(alignment_file, 'rb')
351+
else:
352+
f1 = open(alignment_file, 'r')
353+
346354
ORF2hits = {}
347355
all_hits = set()
348356

349357
ORF = 'first ORF'
350358
ORF_done = False
351-
with open(alignment_file, 'r') as f1:
352-
for line in f1:
353-
if line.startswith(ORF) and ORF_done == True:
354-
# The ORF has already surpassed its minimum allowed bit-score.
355-
continue
356-
357-
line = line.rstrip().split('\t')
358-
359-
if not line[0] == ORF:
360-
# A new ORF is reached.
361-
ORF = line[0]
362-
best_bitscore = decimal.Decimal(line[11])
363-
ORF2hits[ORF] = []
364-
365-
ORF_done = False
366-
367-
bitscore = decimal.Decimal(line[11])
368-
369-
if bitscore >= one_minus_r * best_bitscore:
370-
# The hit has a high enough bit-score to be included.
371-
hit = line[1]
372-
373-
ORF2hits[ORF].append((hit, bitscore),)
374-
all_hits.add(hit)
375-
else:
376-
# The hit is not included because its bit-score is too low.
377-
ORF_done = True
359+
for line in f1:
360+
if compressed:
361+
line = line.decode('utf-8')
362+
363+
if line.startswith(ORF) and ORF_done == True:
364+
# The ORF has already surpassed its minimum allowed bit-score.
365+
continue
366+
367+
line = line.rstrip().split('\t')
368+
369+
if not line[0] == ORF:
370+
# A new ORF is reached.
371+
ORF = line[0]
372+
best_bitscore = decimal.Decimal(line[11])
373+
ORF2hits[ORF] = []
374+
375+
ORF_done = False
376+
377+
bitscore = decimal.Decimal(line[11])
378+
379+
if bitscore >= one_minus_r * best_bitscore:
380+
# The hit has a high enough bit-score to be included.
381+
hit = line[1]
382+
383+
ORF2hits[ORF].append((hit, bitscore),)
384+
all_hits.add(hit)
385+
else:
386+
# The hit is not included because its bit-score is too low.
387+
ORF_done = True
388+
389+
f1.close()
378390

379391
return (ORF2hits, all_hits)
380392

CAT_pack/single_bin.py

Lines changed: 12 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -212,6 +212,12 @@ def parse_arguments():
212212
action=shared.PathAction,
213213
help=('Directory for temporary DIAMOND files (default: directory '
214214
'to which output files are written).'))
215+
specific.add_argument(
216+
'--compress',
217+
dest='compress',
218+
required=False,
219+
action='store_true',
220+
help='Compress DIAMOND alignment file.')
215221
specific.add_argument(
216222
'--top',
217223
dest='top',
@@ -310,7 +316,7 @@ def run():
310316
args.bin_fasta,
311317
args.taxonomy_folder,
312318
args.database_folder,
313-
float(args.r),
319+
int(args.r),
314320
float(args.f),
315321
args.log_file))
316322
shared.give_user_feedback(message, args.log_file, args.quiet,
@@ -399,6 +405,10 @@ def run():
399405
check.check_fasta(
400406
args.proteins_fasta, args.log_file, args.quiet))
401407

408+
if 'align' in step_list:
409+
errors.append(
410+
check.check_top(args.top, args.r, args.log_file, args.quiet))
411+
402412
# Print all variables.
403413
shared.print_variables(args, step_list)
404414

@@ -429,20 +439,7 @@ def run():
429439
contig_names, contig2ORFs, args.log_file, args.quiet)
430440

431441
if 'align' in step_list:
432-
shared.run_diamond(
433-
args.path_to_diamond,
434-
args.diamond_database,
435-
args.proteins_fasta,
436-
args.alignment_file,
437-
args.nproc,
438-
args.sensitive,
439-
args.block_size,
440-
args.index_chunks,
441-
args.tmpdir,
442-
args.top,
443-
args.log_file,
444-
args.quiet,
445-
args.verbose)
442+
shared.run_diamond(args)
446443

447444
(ORF2hits,
448445
all_hits) = shared.parse_tabular_alignment(

CHANGELOG.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
# Changelog
22

3+
## 5.1.1
4+
CAT and BAT can now compress the DIAMOND alignment file (with the `--compress` flag), and can import gzip-compressed alignment files.
5+
36
## 5.1
47
The code has been rewritten to prepare for future extensions. We have also added the `--verbose` flag.
58

README.md

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ CAT and BAT have been thoroughly tested on Linux systems, and should run on macO
3131
No installation is required. You can run CAT and BAT by supplying the path to the executable:
3232

3333
```
34-
$ CAT_pack/CAT --help
34+
$ ./CAT_pack/CAT --help
3535
```
3636

3737
Alternatively, if you add the files in the CAT\_pack directory to your `$PATH` variable, you can run CAT and BAT from anywhere:
@@ -86,6 +86,17 @@ The taxonomy folder and database folder created by CAT prepare are needed in sub
8686

8787
To run CAT on a contig set, each header in the contig fasta file (the part after `>` and before the first space) needs to be unique. To run BAT on a set of MAGs, each header in a MAG needs to be unique within that MAG. If you are unsure whether this is the case, you can just run CAT or BAT, as the appropriate error messages are generated if the formatting is incorrect.
8888

89+
### Getting help
90+
If you are unsure which options a program has, you can always add `--help` to the command. This is a great way to get started with CAT and BAT.
91+
92+
```
93+
$ CAT --help
94+
95+
$ CAT contigs --help
96+
97+
$ CAT summarise --help
98+
```
99+
89100
## Usage
90101
After you have got the database files on your system, you can run CAT to annotate your contig set:
91102

0 commit comments

Comments
 (0)