@@ -1333,7 +1333,27 @@ def gb2smurf(input, prot_out, smurf_out):
1333
1333
smurf .write ("%s\t %s\t %s\t %s\t %s\n " % (locus_tag , name .lstrip ("0" ), int (mystart ), int (myend ), product_name ))
1334
1334
else :
1335
1335
smurf .write ("%s\t %s\t %s\t %s\t %s\n " % (locus_tag , name .lstrip ("0" ), int (myend ), int (mystart ), product_name ))
1336
-
1336
+
1337
def GAGprotClean(input, output):
    '''
    Rewrite a GAG protein FASTA file so each header is just the bare model ID.

    gag.py v1 had headers like:
    >>evm.model.Contig100.1 protein
    gag.py v2 has headers like:
    >protein|evm.model.scaffold_1.169 ID=evm.model.scaffold_1.169|Parent=evm.TU.scaffold_1.169|Name=EVM%20prediction%20scaffold_1.169

    For every record the leading 'protein|' prefix (if present) is removed
    from the record ID, anything after the first space is dropped, and the
    record name and description are blanked before the record is written.

    input  -- path of the FASTA file to read
    output -- path of the FASTA file to write (opened in 'w' mode, so any
              existing content is replaced)
    '''
    prefix = 'protein|'
    # Plain 'r' rather than the original 'ru': lowercase 'u' is not a valid
    # mode flag and Python 3 rejects it with ValueError.
    # The input is opened first so that an unreadable input path does not
    # leave a freshly truncated output file behind.
    with open(input, 'r') as infile, open(output, 'w') as outfile:
        for rec in SeqIO.parse(infile, 'fasta'):
            ident = rec.id
            if ident.startswith(prefix):
                # Strip only the leading tag, not later occurrences in the ID.
                ident = ident[len(prefix):]
            rec.id = ident.split(' ')[0]
            # Blank the remaining header fields so only the ID is emitted.
            rec.name = ''
            rec.description = ''
            SeqIO.write(rec, outfile, 'fasta')
1355
+
1356
+
1337
1357
def RemoveBadModels (proteins , gff , length , repeats , BlastResults , tmpdir , output ):
1338
1358
#first run bedtools to intersect models where 90% of gene overlaps with repeatmasker region
1339
1359
repeat_temp = os .path .join (tmpdir , 'genome.repeats.to.remove.gff' )
0 commit comments