Fix for different versions of GAG outputting different protein names

Jon Palmer · Jon Palmer · commit f59fd2570b9c · 2017-05-16T15:03:48.000-05:00
diff --git a/bin/funannotate-predict.py b/bin/funannotate-predict.py
@@ -1034,7 +1034,10 @@ def __init__(self, prog):
 cmd = ['gag.py', '-f', MaskGenome, '-g', GFF, '-o', gag1dir,'--fix_start_stop']
 lib.runSubprocess(cmd, '.', lib.log)
 GAG_gff = os.path.join(gag1dir, 'genome.gff')
-GAG_proteins = os.path.join(gag1dir, 'genome.proteins.fasta')
+GAG_proteins_original = os.path.join(gag1dir, 'genome.proteins.fasta')
+GAG_proteins = os.path.join(args.out, 'predict_misc', 'gag1.proteins.fasta')
+#clean up GAG proteins so names are consistent between versions
+lib.GAGprotClean(GAG_proteins_original, GAG_proteins)
 total = lib.countGFFgenes(GAG_gff)
 lib.log.info('{0:,}'.format(total) + ' total gene models')
 
diff --git a/lib/library.py b/lib/library.py
@@ -1333,7 +1333,27 @@ def gb2smurf(input, prot_out, smurf_out):
                                 smurf.write("%s\t%s\t%s\t%s\t%s\n" % (locus_tag, name.lstrip("0"), int(mystart), int(myend), product_name))
                             else:
                                 smurf.write("%s\t%s\t%s\t%s\t%s\n" % (locus_tag, name.lstrip("0"), int(myend), int(mystart), product_name))
-                            
+
+def GAGprotClean(input, output):
+    '''
+    Copy FASTA records from input to output, reducing each header to the bare model ID.
+    gag.py v1 had headers like:
+    >>evm.model.Contig100.1 protein
+    gag.py v2 has headers like:
+    >protein|evm.model.scaffold_1.169 ID=evm.model.scaffold_1.169|Parent=evm.TU.scaffold_1.169|Name=EVM%20prediction%20scaffold_1.169
+    '''
+    with open(output, 'w') as outfile:
+        # hand SeqIO the path itself: 'ru' is not a valid open() mode on Python 3
+        for rec in SeqIO.parse(input, 'fasta'):
+            ID = rec.id
+            if ID.startswith('protein|'):
+                ID = ID.replace('protein|', '')
+            rec.id = ID.split(' ')[0]
+            rec.name = ''
+            rec.description = ''
+            SeqIO.write(rec, outfile, 'fasta')
+                
+                          
 def RemoveBadModels(proteins, gff, length, repeats, BlastResults, tmpdir, output):
     #first run bedtools to intersect models where 90% of gene overlaps with repeatmasker region
     repeat_temp = os.path.join(tmpdir, 'genome.repeats.to.remove.gff')