Skip to content

Commit f59fd25

Browse files
Jon PalmerJon Palmer
Jon Palmer
authored and
Jon Palmer
committed
fix for different versions of GAG outputting different protein names
1 parent 96ec504 commit f59fd25

File tree

2 files changed

+25
-2
lines changed

2 files changed

+25
-2
lines changed

bin/funannotate-predict.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -1034,7 +1034,10 @@ def __init__(self, prog):
10341034
cmd = ['gag.py', '-f', MaskGenome, '-g', GFF, '-o', gag1dir,'--fix_start_stop']
10351035
lib.runSubprocess(cmd, '.', lib.log)
10361036
GAG_gff = os.path.join(gag1dir, 'genome.gff')
1037-
GAG_proteins = os.path.join(gag1dir, 'genome.proteins.fasta')
1037+
GAG_proteins_original = os.path.join(gag1dir, 'genome.proteins.fasta')
1038+
GAG_proteins = os.path.join(args.out, 'predict_misc', 'gag1.proteins.fasta')
1039+
#clean up GAG proteins so names are consistent between versions
1040+
lib.GAGprotClean(GAG_proteins_original, GAG_proteins)
10381041
total = lib.countGFFgenes(GAG_gff)
10391042
lib.log.info('{0:,}'.format(total) + ' total gene models')
10401043

lib/library.py

+21-1
Original file line numberDiff line numberDiff line change
@@ -1333,7 +1333,27 @@ def gb2smurf(input, prot_out, smurf_out):
13331333
smurf.write("%s\t%s\t%s\t%s\t%s\n" % (locus_tag, name.lstrip("0"), int(mystart), int(myend), product_name))
13341334
else:
13351335
smurf.write("%s\t%s\t%s\t%s\t%s\n" % (locus_tag, name.lstrip("0"), int(myend), int(mystart), product_name))
1336-
1336+
1337+
def GAGprotClean(input, output):
    '''
    Rewrite a GAG protein FASTA file so every header is just the model ID.

    gag.py v1 had headers like:
    >>evm.model.Contig100.1 protein
    gag.py v2 has headers like:
    >protein|evm.model.scaffold_1.169 ID=evm.model.scaffold_1.169|Parent=evm.TU.scaffold_1.169|Name=EVM%20prediction%20scaffold_1.169

    Each record is written to output with a leading 'protein|' prefix removed
    from its ID and with an empty name and description, so the header line
    contains only the ID. Sequences are passed through unchanged.

    input  -- path to the protein FASTA file to read
    output -- path of the cleaned FASTA file to write (truncated if it exists)
    '''
    # NOTE(review): if v1 headers really begin with '>>', the parsed ID
    # presumably keeps one leading '>' -- confirm against real v1 output.
    prefix = 'protein|'
    # Plain 'r' rather than 'ru': 'ru' is rejected as an invalid mode on Python 3.
    # The input is opened first so a missing input file does not leave an
    # empty output file behind.
    with open(input, 'r') as infile, open(output, 'w') as outfile:
        for rec in SeqIO.parse(infile, 'fasta'):
            ID = rec.id
            if ID.startswith(prefix):
                # strip only the leading prefix, not any later occurrence
                ID = ID[len(prefix):]
            # defensive: keep only the first space-delimited token
            rec.id = ID.split(' ')[0]
            # blank these so only the ID is emitted on the header line
            rec.name = ''
            rec.description = ''
            SeqIO.write(rec, outfile, 'fasta')
1355+
1356+
13371357
def RemoveBadModels(proteins, gff, length, repeats, BlastResults, tmpdir, output):
13381358
#first run bedtools to intersect models where 90% of gene overlaps with repeatmasker region
13391359
repeat_temp = os.path.join(tmpdir, 'genome.repeats.to.remove.gff')

0 commit comments

Comments
 (0)