Skip to content

Commit 3dca9b4

Browse files
authored
Merge pull request #46 from EBI-Metagenomics/hotfix/assembly_gff_builder
Several build_assembly_gff fixes
2 parents b397b4c + e88146b commit 3dca9b4

File tree

2 files changed

+89
-91
lines changed

2 files changed

+89
-91
lines changed

collect_scripts.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,5 +86,5 @@
8686

8787
shutil.copy(file_path, dest)
8888
print(f"Script {file_path} copied to {dest}")
89-
os.chmod(dest, S_IREAD | S_IRGRP | S_IWUSR | S_IRGRP | S_IEXEC)
89+
os.chmod(dest, S_IREAD | S_IRGRP | S_IWUSR | S_IRGRP | S_IEXEC | S_IXGRP)
9090
print(f"- made {dest} as writable")

tools/Assembly/GFF/build_assembly_gff.py

Lines changed: 88 additions & 90 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@
2323

2424

2525
class Annotation:
26-
2726
@classmethod
2827
def merge(cls, annotations):
2928
"""Merge the annotations.
@@ -43,37 +42,41 @@ def merge(cls, annotations):
4342
@classmethod
4443
def clean_seq_name(cls, name):
4544
# prodigal clean up
46-
prodigal_match = re.match('_\d+$', name)
45+
prodigal_match = re.search("_\d+$", name)
4746
if prodigal_match:
48-
return re.sub(r'_\d+$', '', name)
47+
return re.sub(r"_\d+$", "", name)
4948
# fgs clean_up
50-
fgs_match = re.match('^(?P<contig>.+?)_\d+_\d+\_.', name)
49+
fgs_match = re.search("^(?P<contig>.+?)_\d+_\d+\_.", name)
5150
if fgs_match:
5251
groups = fgs_match.groupdict()
53-
return groups['contig']
54-
52+
return groups["contig"]
53+
5554
# no fgs or prodigal
56-
return name
55+
raise Exception("Error parsing the header:" + name)
5756

5857
def _split_line(self, line):
59-
return line.replace('\n', ' ').replace('\r', '').split('\t')
58+
return line.replace("\n", " ").replace("\r", "").split("\t")
6059

6160
def _get_value(self, value, split=True):
6261
if split:
63-
return list(filter(None, value.strip().split(',')))
62+
return list(filter(None, value.strip().split(",")))
6463
else:
6564
return [value.strip()] if value else []
6665

6766
def get(self):
6867
"""
6968
Get the annotation in an array with [Key,Value] structure
7069
"""
71-
return [[a, v] for a, v in sorted(self.__dict__.items()) if a != 'query_name' and len(v)]
70+
return [
71+
[a, v]
72+
for a, v in sorted(self.__dict__.items())
73+
if a != "query_name" and len(v)
74+
]
7275

7376

7477
class EggResult(Annotation):
75-
"""EggNOG tsv result row.
76-
"""
78+
"""EggNOG tsv result row."""
79+
7780
def __init__(self, line):
7881
"""Lines parsed according to the documentation
7982
https://github.com/eggnogdb/eggnog-mapper/wiki/eggNOG-mapper-v2#v200
@@ -96,57 +99,48 @@ def __init__(self, line):
9699

97100

98101
class InterProResult(Annotation):
99-
"""InterPro scan result row.
100-
"""
102+
"""InterPro scan result row."""
103+
101104
def __init__(self, line):
102105
columns = self._split_line(line)
103106
self.query_name = columns[0].strip()
104107
pfam = columns[4]
105-
if re.match('PF\d+', pfam): # noqa
108+
if re.match("PF\d+", pfam): # noqa
106109
self.pfam = self._get_value(pfam)
107110
if len(columns) > 11:
108111
self.interpro = self._get_value(columns[11])
109112

110113

111-
def parse_fasta_header_mags(header):
112-
match = re.match(
113-
'^\>(?P<contig>.+\-\-contig:-.*)\s\#\s(?P<start>\d+)\s\#\s(?P<end>\d+)\s\#\s(?P<strand>\-1|1)\s.*$', header) # noqa
114-
if match:
115-
groups = match.groupdict()
116-
return groups['contig'], groups['start'], groups['end'], groups['strand']
117-
return None, None, None, None
118-
119-
120114
def parse_fasta_header(header):
121-
# FIXME: add sanity check
122-
# FIXME: what should we do with cases like :
123-
# ERZ477576.1085103-NODE-1085103-length-126-cov-0.633803_1_126_+
124-
125-
# Prodigal header example: >NODE-3-length-2984-cov-4.247866_3 # 1439 # 1894 # 1 # ID=3_3;partial=00;start_type=TTG;rbs_motif=TAA;rbs_spacer=8bp;gc_cont=0.340
126-
prodigal_match = re.match('^>(?P<contig>.+?)\s#\s(?P<start>\d+)\s#\s(?P<end>\d+)\s#\s(?P<strand>.+?)\s#', header)
115+
"""Parse the hader header, only 2 supported formats are prodigal and FGS."""
116+
117+
# Prodigal header example: >NODE-3-length-2984-cov-4.247866_3 # 1439 # 1894 # 1 # ID=3_3;partial=00;start_type=TTG;rbs_motif=TAA;rbs_spacer=8bp;gc_cont=0.340
118+
prodigal_match = re.match(
119+
"^>(?P<contig>.+?)\s#\s(?P<start>\d+)\s#\s(?P<end>\d+)\s#\s(?P<strand>.+?)\s#",
120+
header,
121+
)
127122
if prodigal_match:
128123
groups = prodigal_match.groupdict()
129-
return groups['contig'], groups['start'], groups['end'], groups['strand']
130-
124+
return groups["contig"], groups["start"], groups["end"], groups["strand"]
125+
131126
# FGS header example: >ERZ1759872.3-contig-100_3188_4599_-
132-
fgs_match = re.match('^>.+?_(?P<start>\d+)_(?P<end>\d+)_(?P<strand>.)', header)
127+
fgs_match = re.match("^>.+?_(?P<start>\d+)_(?P<end>\d+)_(?P<strand>.)", header)
133128
if fgs_match:
134129
groups = fgs_match.groupdict()
135-
strand = '1'
136-
if groups['strand'] == '-':
130+
strand = "1"
131+
if groups["strand"] == "-":
137132
strand = "-1"
138-
return header.rstrip().replace(">", ""), groups['start'], groups['end'], strand
139-
133+
return header.rstrip().replace(">", ""), groups["start"], groups["end"], strand
134+
140135
# unable to parse fasta header
141-
return None, None, None, None
142-
136+
raise Exception("Unable to parse fasta header " + header)
137+
143138

144139
def load_annotation(file, klass, annotations):
145-
"""Load the annotations of a TSV by `query_name` (contig name at the moment)
146-
"""
147-
with open(file, 'rt') as ann_file:
140+
"""Load the annotations of a TSV by `query_name` (contig name at the moment)"""
141+
with open(file, "rt") as ann_file:
148142
for line in ann_file:
149-
if '#' in line:
143+
if "#" in line:
150144
continue
151145
parsed_line = klass(line)
152146
if parsed_line.query_name in annotations:
@@ -187,86 +181,90 @@ def build_gff(annotations, faa):
187181
ctg123 . exon 5000 5500 . + . ID=exon00004;Parent=mrna0001
188182
ctg123 . exon 7000 9000 . + . ID=exon00005;Parent=mrna0001
189183
"""
190-
with open(faa, 'rt') as faa_file:
184+
with open(faa, "rt") as faa_file:
191185
for line in faa_file:
192-
if '>' not in line:
186+
if ">" not in line:
193187
continue
194188

195189
# each fasta is suffixed on the annotated faa if a prefix _INT (_1 .. _n)
196190
contig_name, start, end, strand = parse_fasta_header(line)
197191
if None in (contig_name, start, end, strand):
198-
print(line, end='', file=sys.stderr)
192+
print(
193+
"It was not possible to parse the " + line, end="", file=sys.stderr
194+
)
199195
continue
200196

201197
clean_name = Annotation.clean_seq_name(contig_name)
202198

203-
row_annotations = Annotation.merge([ann.get() for ann in annotations.get(contig_name, [])])
204-
205-
ann_string = ';'.join(['{}={}'.format(k, ','.join(v).strip()) for k, v in row_annotations.items()])
199+
row_annotations = Annotation.merge(
200+
[ann.get() for ann in annotations.get(contig_name, [])]
201+
)
202+
203+
ann_string = ";".join(
204+
[
205+
"{}={}".format(k, ",".join(v).strip())
206+
for k, v in row_annotations.items()
207+
]
208+
)
206209

207-
eggNOGScore = ''.join(row_annotations.get('eggNOG_score', []))
210+
eggNOGScore = "".join(row_annotations.get("eggNOG_score", []))
208211

209212
if len(ann_string):
210213
yield [
211214
clean_name,
212-
'eggNOG-v2',
213-
'CDS',
215+
"eggNOG-v2",
216+
"CDS",
214217
start,
215218
end,
216-
eggNOGScore or '.',
217-
'+' if strand == '1' else '-',
218-
'.',
219-
'ID=' + clean_name + ';' + ann_string
219+
eggNOGScore or ".",
220+
"+" if strand == "1" else "-",
221+
".",
222+
"ID=" + clean_name + ";" + ann_string,
220223
]
221224

222-
def error_exit(gff_file):
223-
if os.path.exists(gff_file):
224-
os.remove(gff_file)
225-
open("no_antismash", "w").close()
226-
sys.exit(0)
227225

228-
229-
230-
if __name__ == '__main__':
226+
if __name__ == "__main__":
231227
parser = argparse.ArgumentParser(
232-
description='Build an assembly GFF file (sorted and indexed using samtools and tabix)')
228+
description="Build an assembly GFF file (sorted and indexed using samtools and tabix)"
229+
)
233230
parser.add_argument(
234-
'-e', dest='egg', help='EggNOG tsv results. eggNOG version 2 required.', required=False)
231+
"-e",
232+
dest="egg",
233+
help="EggNOG tsv results. eggNOG version 2 required.",
234+
required=True,
235+
)
235236
parser.add_argument(
236-
'-i', dest='interpro', help='InterProScan tsv results', required=False)
237-
# FIXME: Are we going to use this?
238-
# parser.add_argument(
239-
# '-a', dest='antismash', help='antiSMASH tsv results (geneclusters.txt)', required=False)
240-
# FIXME: Are we going to use this?
241-
# parser.add_argument(
242-
# '-k', dest='keggmodule', help='KEGG Modules per contig annotation', required=False)
237+
"-i", dest="interpro", help="InterProScan tsv results", required=True
238+
)
243239
parser.add_argument(
244-
'-f', dest='faa', help='FASTA with the CDS annotated (faa)', required=True)
245-
parser.add_argument(
246-
'-o', dest='out', help='Ouput GFF file name', required=True)
240+
"-f", dest="faa", help="FASTA with the CDS annotated (faa)", required=True
241+
)
242+
parser.add_argument("-o", dest="out", help="Ouput GFF file name", required=True)
247243
args = parser.parse_args()
248244

249245
annotations = {}
250246
load_annotation(args.egg, EggResult, annotations)
251247
load_annotation(args.interpro, InterProResult, annotations)
252248

253249
if len(annotations) < 1:
254-
print("No annotations loaded, aborting")
255-
error_exit(args.out)
250+
raise Exception("No annotations loaded, aborting")
256251

257252
records = 0
258-
with open(args.out, 'w', buffering=1) as out_handle:
259-
print('##gff-version 3', file=out_handle)
253+
with open(args.out, "w", buffering=1) as out_handle:
254+
print("##gff-version 3", file=out_handle)
260255
for row in build_gff(annotations, args.faa):
261-
print('\t'.join(row), file=out_handle)
256+
print("\t".join(row), file=out_handle)
262257
records += 1
258+
263259
if records == 0:
264-
print("No annotations in GFF, aborting")
265-
error_exit(args.out)
266-
267-
print('Sorting...')
268-
subprocess.call('(grep ^"#" {0}; grep -v ^"#" {0} | sort -k1,1 -k4,4n)'.format(args.out) +
269-
' | bgzip > {0}.bgz'.format(args.out), shell=True)
270-
print('Building the index...')
271-
subprocess.call(['tabix', '-p', 'gff', '{}.bgz'.format(args.out)])
272-
print('Bye')
260+
raise Exception("No annotations in GFF, aborting")
261+
262+
print("Sorting...")
263+
subprocess.call(
264+
'(grep ^"#" {0}; grep -v ^"#" {0} | sort -k1,1 -k4,4n)'.format(args.out)
265+
+ " | bgzip > {0}.bgz".format(args.out),
266+
shell=True,
267+
)
268+
print("Building the index...")
269+
subprocess.call(["tabix", "-p", "gff", "{}.bgz".format(args.out)])
270+
print("Bye")

0 commit comments

Comments
 (0)