2323
2424
2525class Annotation :
26-
2726 @classmethod
2827 def merge (cls , annotations ):
2928 """Merge the annotations.
@@ -43,37 +42,41 @@ def merge(cls, annotations):
4342 @classmethod
4443 def clean_seq_name (cls , name ):
4544 # prodigal clean up
46- prodigal_match = re .match ( ' _\d+$' , name )
45+ prodigal_match = re .search ( " _\d+$" , name )
4746 if prodigal_match :
48- return re .sub (r' _\d+$' , '' , name )
47+ return re .sub (r" _\d+$" , "" , name )
4948 # fgs clean_up
50- fgs_match = re .match ( ' ^(?P<contig>.+?)_\d+_\d+\_.' , name )
49+ fgs_match = re .search ( " ^(?P<contig>.+?)_\d+_\d+\_." , name )
5150 if fgs_match :
5251 groups = fgs_match .groupdict ()
53- return groups [' contig' ]
54-
52+ return groups [" contig" ]
53+
5554 # no fgs or prodigal
56- return name
55+ raise Exception ( "Error parsing the header:" + name )
5756
5857 def _split_line (self , line ):
59- return line .replace (' \n ' , ' ' ).replace (' \r ' , '' ).split (' \t ' )
58+ return line .replace (" \n " , " " ).replace (" \r " , "" ).split (" \t " )
6059
6160 def _get_value (self , value , split = True ):
6261 if split :
63- return list (filter (None , value .strip ().split (',' )))
62+ return list (filter (None , value .strip ().split ("," )))
6463 else :
6564 return [value .strip ()] if value else []
6665
6766 def get (self ):
6867 """
6968 Get the annotation in an array with [Key,Value] structure
7069 """
71- return [[a , v ] for a , v in sorted (self .__dict__ .items ()) if a != 'query_name' and len (v )]
70+ return [
71+ [a , v ]
72+ for a , v in sorted (self .__dict__ .items ())
73+ if a != "query_name" and len (v )
74+ ]
7275
7376
7477class EggResult (Annotation ):
75- """EggNOG tsv result row.
76- """
78+ """EggNOG tsv result row."""
79+
7780 def __init__ (self , line ):
7881 """Lines parsed according to the documentation
7982 https://github.com/eggnogdb/eggnog-mapper/wiki/eggNOG-mapper-v2#v200
@@ -96,57 +99,48 @@ def __init__(self, line):
9699
97100
class InterProResult(Annotation):
    """InterPro scan result row.

    Extracts the query name (column 0), a Pfam accession (column 4, only
    when it looks like "PF<digits>") and, when the row has one, the
    InterPro accession (column 11) from one InterProScan TSV line.
    """

    def __init__(self, line):
        columns = self._split_line(line)
        self.query_name = columns[0].strip()
        pfam = columns[4]
        # raw string fixes the invalid "\d" escape of the original pattern
        # (and makes the old "# noqa" unnecessary)
        if re.match(r"PF\d+", pfam):
            self.pfam = self._get_value(pfam)
        if len(columns) > 11:
            self.interpro = self._get_value(columns[11])
110113
def parse_fasta_header_mags(header):
    """Parse a MAGs-style prodigal FASTA header.

    Returns (contig, start, end, strand) as strings, or four Nones when the
    header does not match the expected layout.
    """
    pattern = (
        r"^\>(?P<contig>.+\-\-contig:-.*)\s\#\s(?P<start>\d+)"
        r"\s\#\s(?P<end>\d+)\s\#\s(?P<strand>\-1|1)\s.*$"
    )  # noqa
    match = re.match(pattern, header)
    if match is None:
        return None, None, None, None
    return (
        match.group("contig"),
        match.group("start"),
        match.group("end"),
        match.group("strand"),
    )
119-
def parse_fasta_header(header):
    """Parse a CDS FASTA header; the only 2 supported formats are prodigal and FGS.

    :param header: one ">"-prefixed FASTA header line.
    :return: (contig, start, end, strand) as strings; strand is "1" or "-1".
    :raises Exception: if the header matches neither supported format.
    """
    # Prodigal header example: >NODE-3-length-2984-cov-4.247866_3 # 1439 # 1894 # 1 # ID=3_3;partial=00;start_type=TTG;rbs_motif=TAA;rbs_spacer=8bp;gc_cont=0.340
    # raw strings fix the invalid "\s"/"\d" escapes of the original patterns
    prodigal_match = re.match(
        r"^>(?P<contig>.+?)\s#\s(?P<start>\d+)\s#\s(?P<end>\d+)\s#\s(?P<strand>.+?)\s#",
        header,
    )
    if prodigal_match:
        groups = prodigal_match.groupdict()
        return groups["contig"], groups["start"], groups["end"], groups["strand"]

    # FGS header example: >ERZ1759872.3-contig-100_3188_4599_-
    fgs_match = re.match(r"^>.+?_(?P<start>\d+)_(?P<end>\d+)_(?P<strand>.)", header)
    if fgs_match:
        groups = fgs_match.groupdict()
        # FGS encodes the strand as "+"/"-"; normalize to the numeric form.
        strand = "-1" if groups["strand"] == "-" else "1"
        return header.rstrip().replace(">", ""), groups["start"], groups["end"], strand

    # unable to parse fasta header
    raise Exception("Unable to parse fasta header " + header)
137+
143138
144139def load_annotation (file , klass , annotations ):
145- """Load the annotations of a TSV by `query_name` (contig name at the moment)
146- """
147- with open (file , 'rt' ) as ann_file :
140+ """Load the annotations of a TSV by `query_name` (contig name at the moment)"""
141+ with open (file , "rt" ) as ann_file :
148142 for line in ann_file :
149- if '#' in line :
143+ if "#" in line :
150144 continue
151145 parsed_line = klass (line )
152146 if parsed_line .query_name in annotations :
@@ -187,86 +181,90 @@ def build_gff(annotations, faa):
187181 ctg123 . exon 5000 5500 . + . ID=exon00004;Parent=mrna0001
188182 ctg123 . exon 7000 9000 . + . ID=exon00005;Parent=mrna0001
189183 """
190- with open (faa , 'rt' ) as faa_file :
184+ with open (faa , "rt" ) as faa_file :
191185 for line in faa_file :
192- if '>' not in line :
186+ if ">" not in line :
193187 continue
194188
195189 # each fasta is suffixed on the annotated faa if a prefix _INT (_1 .. _n)
196190 contig_name , start , end , strand = parse_fasta_header (line )
197191 if None in (contig_name , start , end , strand ):
198- print (line , end = '' , file = sys .stderr )
192+ print (
193+ "It was not possible to parse the " + line , end = "" , file = sys .stderr
194+ )
199195 continue
200196
201197 clean_name = Annotation .clean_seq_name (contig_name )
202198
203- row_annotations = Annotation .merge ([ann .get () for ann in annotations .get (contig_name , [])])
204-
205- ann_string = ';' .join (['{}={}' .format (k , ',' .join (v ).strip ()) for k , v in row_annotations .items ()])
199+ row_annotations = Annotation .merge (
200+ [ann .get () for ann in annotations .get (contig_name , [])]
201+ )
202+
203+ ann_string = ";" .join (
204+ [
205+ "{}={}" .format (k , "," .join (v ).strip ())
206+ for k , v in row_annotations .items ()
207+ ]
208+ )
206209
207- eggNOGScore = '' .join (row_annotations .get (' eggNOG_score' , []))
210+ eggNOGScore = "" .join (row_annotations .get (" eggNOG_score" , []))
208211
209212 if len (ann_string ):
210213 yield [
211214 clean_name ,
212- ' eggNOG-v2' ,
213- ' CDS' ,
215+ " eggNOG-v2" ,
216+ " CDS" ,
214217 start ,
215218 end ,
216- eggNOGScore or '.' ,
217- '+' if strand == '1' else '-' ,
218- '.' ,
219- ' ID=' + clean_name + ';' + ann_string
219+ eggNOGScore or "." ,
220+ "+" if strand == "1" else "-" ,
221+ "." ,
222+ " ID=" + clean_name + ";" + ann_string ,
220223 ]
221224
def error_exit(gff_file):
    """Abort gracefully: drop any partial GFF output and leave a sentinel.

    Removes *gff_file* if it was created, touches an empty "no_antismash"
    marker file, and exits with status 0 so the pipeline treats this as a
    soft failure rather than an error.
    """
    # don't leave a half-written GFF behind
    if os.path.exists(gff_file):
        os.remove(gff_file)
    # sentinel file; presumably consumed by a downstream antiSMASH step — TODO confirm
    open("no_antismash", "w").close()
    sys.exit(0)
227225
228-
229-
if __name__ == "__main__":
    # local import: only the CLI entry point needs shell quoting
    import shlex

    parser = argparse.ArgumentParser(
        description="Build an assembly GFF file (sorted and indexed using samtools and tabix)"
    )
    parser.add_argument(
        "-e",
        dest="egg",
        help="EggNOG tsv results. eggNOG version 2 required.",
        required=True,
    )
    parser.add_argument(
        "-i", dest="interpro", help="InterProScan tsv results", required=True
    )
    parser.add_argument(
        "-f", dest="faa", help="FASTA with the CDS annotated (faa)", required=True
    )
    # "Output" typo fixed in the help text
    parser.add_argument("-o", dest="out", help="Output GFF file name", required=True)
    args = parser.parse_args()

    # query_name -> list of parsed annotation rows, merged across both inputs
    annotations = {}
    load_annotation(args.egg, EggResult, annotations)
    load_annotation(args.interpro, InterProResult, annotations)

    if len(annotations) < 1:
        raise Exception("No annotations loaded, aborting")

    records = 0
    # buffering=1 -> line buffered, so partial output survives a crash
    with open(args.out, "w", buffering=1) as out_handle:
        print("##gff-version 3", file=out_handle)
        for row in build_gff(annotations, args.faa):
            print("\t".join(row), file=out_handle)
            records += 1

    if records == 0:
        raise Exception("No annotations in GFF, aborting")

    print("Sorting...")
    # the grep/sort/bgzip pipeline needs shell=True; quote the user-supplied
    # path so file names with spaces or shell metacharacters cannot break
    # (or inject into) the command line
    quoted_out = shlex.quote(args.out)
    subprocess.call(
        '(grep ^"#" {0}; grep -v ^"#" {0} | sort -k1,1 -k4,4n)'.format(quoted_out)
        + " | bgzip > {0}.bgz".format(quoted_out),
        shell=True,
    )
    print("Building the index...")
    subprocess.call(["tabix", "-p", "gff", "{}.bgz".format(args.out)])
    print("Bye")
0 commit comments