Skip to content

Commit 218edc4

Browse files
author
Jon Palmer
committed
add adapters and prep for eggnog2
1 parent 0f1bdbb commit 218edc4

File tree

3 files changed

+68
-17
lines changed

3 files changed

+68
-17
lines changed

funannotate/annotate.py

+60-17
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ def MEROPSBlast(input, cpus, evalue, tmpdir, output, diamond=True):
3636
lib.runSubprocess(cmd, '.', lib.log)
3737
# parse results
3838
with open(output, 'w') as out:
39-
with open(blast_tmp, 'rU') as results:
39+
with open(blast_tmp, 'r') as results:
4040
for qresult in SearchIO.parse(results, "blast-xml"):
4141
hits = qresult.hits
4242
ID = qresult.id
@@ -66,7 +66,7 @@ def SwissProtBlast(input, cpus, evalue, tmpdir, GeneDict, diamond=True):
6666
# parse results
6767
counter = 0
6868
total = 0
69-
with open(blast_tmp, 'rU') as results:
69+
with open(blast_tmp, 'r') as results:
7070
for qresult in SearchIO.parse(results, "blast-xml"):
7171
hits = qresult.hits
7272
qlen = qresult.seq_len
@@ -155,7 +155,7 @@ def getEggNogHeaders(input):
155155
12 eggNOG annot
156156
'''
157157
IDi, DBi, OGi, Genei, COGi, Desci = (None,)*6
158-
with open(input, 'rU') as infile:
158+
with open(input, 'r') as infile:
159159
for line in infile:
160160
if line.startswith('#query_name'): # this is HEADER
161161
line = line.rstrip()
@@ -171,14 +171,57 @@ def getEggNogHeaders(input):
171171
IDi, DBi, OGi, Genei, COGi, Desci = (0, 8, 9, 4, 11, 12)
172172
return IDi, DBi, OGi, Genei, COGi, Desci
173173

174+
def getEggNogHeadersv2(input):
    '''
    function to get the column indexes from eggnog-mapper v2 annotations

    The file is scanned for a header line starting with '#query_name'; when
    one is found, the column positions are looked up by name with
    item2index. The web-based eggnog mapper has no header, so in that case
    the positions are taken from the documented v2 column order
    (1-based here, the returned indexes are 0-based):
        1.  query_name
        2.  seed eggNOG ortholog
        3.  seed ortholog evalue
        4.  seed ortholog score
        5.  Predicted taxonomic group
        6.  Predicted protein name
        7.  Gene Ontology terms
        8.  EC number
        9.  KEGG_ko
        10. KEGG_Pathway
        11. KEGG_Module
        12. KEGG_Reaction
        13. KEGG_rclass
        14. BRITE
        15. KEGG_TC
        16. CAZy
        17. BiGG Reaction
        18. tax_scope: eggNOG taxonomic level used for annotation
        19. eggNOG OGs
        20. bestOG (deprecated, use smallest from eggnog OGs)
        21. COG Functional Category
        22. eggNOG free text description

    input: path to the eggnog-mapper annotations file
    returns: tuple of indexes (IDi, DBi, OGi, Genei, COGi, Desci) for the
        query name, taxonomic scope, eggNOG OGs, preferred gene name,
        COG functional category and free text description columns
    '''
    IDi, DBi, OGi, Genei, COGi, Desci = (None,)*6
    with open(input, 'r') as infile:
        for line in infile:
            if line.startswith('#query_name'):  # this is HEADER
                headerCols = line.rstrip().split('\t')
                # NOTE(review): item2index is defined elsewhere in this
                # module; presumably it returns the position of the column
                # matching the given name -- confirm how a miss is reported
                IDi = item2index(headerCols, 'query_name')
                Genei = item2index(headerCols, 'Preferred_name')
                DBi = item2index(headerCols, 'taxonomic scope')
                OGi = item2index(headerCols, 'eggNOG OGs')
                COGi = item2index(headerCols, 'COG Functional cat.')
                Desci = item2index(headerCols, 'eggNOG free text desc.')
                break
    # compare against None explicitly: query_name is the first column, so a
    # header-derived IDi of 0 is valid and must not trigger the fallback
    if IDi is None:  # then no header in file, so use the documented layout
        IDi, DBi, OGi, Genei, COGi, Desci = (0, 17, 18, 5, 20, 21)
    return IDi, DBi, OGi, Genei, COGi, Desci
174217

175218
def parseEggNoggMapper(input, output, GeneDict):
176219
Definitions = {}
177220
# indexes from header file
178221
IDi, DBi, OGi, Genei, COGi, Desci = getEggNogHeaders(input)
179222
# take annotations file from eggnog-mapper and create annotations
180223
with open(output, 'w') as out:
181-
with open(input, 'rU') as infile:
224+
with open(input, 'r') as infile:
182225
for line in infile:
183226
line = line.replace('\n', '')
184227
if line.startswith('#'):
@@ -330,7 +373,7 @@ def __init__(self, prog):
330373
lib.log.error('Database not properly configured, %s missing. Run funannotate database and/or funannotate setup.' %
331374
os.path.join(FUNDB, 'funannotate-db-info.txt'))
332375
sys.exit(1)
333-
with open(os.path.join(FUNDB, 'funannotate-db-info.txt'), 'rU') as dbfile:
376+
with open(os.path.join(FUNDB, 'funannotate-db-info.txt'), 'r') as dbfile:
334377
for line in dbfile:
335378
line = line.strip()
336379
name, type, file, version, date, num_records, mdchecksum = line.split(
@@ -526,7 +569,7 @@ def __init__(self, prog):
526569
genbank)
527570
# since there is no way to propagate the WGS_accession, it is written to a file and then parsed here
528571
if os.path.isfile(os.path.join(outputdir, 'update_results', 'WGS_accession.txt')):
529-
with open(os.path.join(outputdir, 'update_results', 'WGS_accession.txt'), 'rU') as infile:
572+
with open(os.path.join(outputdir, 'update_results', 'WGS_accession.txt'), 'r') as infile:
530573
for line in infile:
531574
line = line.replace('\n', '')
532575
if line == 'None':
@@ -646,7 +689,7 @@ def __init__(self, prog):
646689
lib.log.info("Combining UniProt/EggNog gene and product names using Gene2Product version %s" %
647690
versDB.get('gene2product'))
648691
CuratedNames = {}
649-
with open(os.path.join(FUNDB, 'ncbi_cleaned_gene_products.txt'), 'rU') as input:
692+
with open(os.path.join(FUNDB, 'ncbi_cleaned_gene_products.txt'), 'r') as input:
650693
for line in input:
651694
line = line.strip()
652695
if line.startswith('#'):
@@ -937,7 +980,7 @@ def __init__(self, prog):
937980

938981
# to update annotations, user can pass --fix or --remove, update Annotations here
939982
if args.fix:
940-
with open(args.fix, 'rU') as fixfile:
983+
with open(args.fix, 'r') as fixfile:
941984
for line in fixfile:
942985
line = line.strip()
943986
if line.startswith('#'):
@@ -958,7 +1001,7 @@ def __init__(self, prog):
9581001
Gene2ProdFinal[cols[0]] = (cols[1], cols[2])
9591002

9601003
if args.remove:
961-
with open(args.remove, 'rU') as removefile:
1004+
with open(args.remove, 'r') as removefile:
9621005
for line in removefile:
9631006
line = line.strip()
9641007
if line.startswith('#'):
@@ -988,13 +1031,13 @@ def __init__(self, prog):
9881031
if args.p2g:
9891032
p2gfile = args.p2g
9901033
if p2gfile:
991-
with open(p2gfile, 'rU') as input:
1034+
with open(p2gfile, 'r') as input:
9921035
for line in input:
9931036
cols = line.split('\t')
9941037
if not cols[0] in p2g:
9951038
p2g[cols[0]] = cols[1]
9961039
with open(os.path.join(outputdir, 'annotate_misc', 'tbl2asn', 'genome.tbl'), 'w') as outfile:
997-
with open(os.path.join(outputdir, 'annotate_misc', 'tbl2asn', 'genome.tbl.bak'), 'rU') as infile:
1040+
with open(os.path.join(outputdir, 'annotate_misc', 'tbl2asn', 'genome.tbl.bak'), 'r') as infile:
9981041
for line in infile:
9991042
line = line.replace('\n', '')
10001043
if line.startswith('\t\t\tprotein_id') or line.startswith('\t\t\ttranscript_id'):
@@ -1147,7 +1190,7 @@ def __init__(self, prog):
11471190
AntiSmashFolder, 'smcluster.MIBiG.blast.txt')
11481191
mibig_db = os.path.join(FUNDB, 'mibig.dmnd')
11491192
with open(mibig_fasta, 'w') as output:
1150-
with open(Proteins, 'rU') as input:
1193+
with open(Proteins, 'r') as input:
11511194
SeqRecords = SeqIO.parse(Proteins, 'fasta')
11521195
for record in SeqRecords:
11531196
genename = record.id
@@ -1160,7 +1203,7 @@ def __init__(self, prog):
11601203
lib.runSubprocess(cmd, '.', lib.log)
11611204
# now parse blast results to get {qseqid: hit}
11621205
MIBiGBlast = {}
1163-
with open(mibig_blast, 'rU') as input:
1206+
with open(mibig_blast, 'r') as input:
11641207
for line in input:
11651208
cols = line.split('\t')
11661209
if '-T' in cols[0]:
@@ -1180,15 +1223,15 @@ def __init__(self, prog):
11801223

11811224
# load in antismash cluster bed file to slice record
11821225
slicing = []
1183-
with open(AntiSmashBed, 'rU') as antibed:
1226+
with open(AntiSmashBed, 'r') as antibed:
11841227
for line in antibed:
11851228
cols = line.split('\t')
11861229
# chr, cluster, start, stop in a tuple
11871230
cluster = (cols[0], cols[3], cols[1], cols[2])
11881231
slicing.append(cluster)
11891232
Offset = {}
11901233
# Get each cluster + 15 Kb in each direction to make sure you can see the context of the cluster
1191-
with open(os.path.join(ResultsFolder, organism_name+'.gbk'), 'rU') as gbk:
1234+
with open(os.path.join(ResultsFolder, organism_name+'.gbk'), 'r') as gbk:
11921235
SeqRecords = SeqIO.parse(gbk, 'genbank')
11931236
for record in SeqRecords:
11941237
for f in record.features:
@@ -1221,7 +1264,7 @@ def __init__(self, prog):
12211264
output.write("#%s\n" % base)
12221265
output.write(
12231266
"#GeneID\tChromosome:start-stop\tStrand\tClusterPred\tBackbone Enzyme\tBackbone Domains\tProduct\tsmCOGs\tEggNog\tInterPro\tPFAM\tGO terms\tNotes\tMIBiG Blast\tProtein Seq\tDNA Seq\n")
1224-
with open(file, 'rU') as input:
1267+
with open(file, 'r') as input:
12251268
SeqRecords = SeqIO.parse(input, 'genbank')
12261269
for record in SeqRecords:
12271270
for f in record.features:
@@ -1333,7 +1376,7 @@ def __init__(self, prog):
13331376
finallist.append(file)
13341377
with open(ClustersOut, 'w') as output:
13351378
for file in natsorted(finallist):
1336-
with open(file, 'rU') as input:
1379+
with open(file, 'r') as input:
13371380
output.write(input.read())
13381381
output.write('\n\n')
13391382

funannotate/config/TruSeq3-PE.fa

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
>PrefixPE/1
2+
TACACTCTTTCCCTACACGACGCTCTTCCGATCT
3+
>PrefixPE/2
4+
GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT

funannotate/config/TruSeq3-SE.fa

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
>TruSeq3_IndexedAdapter
2+
AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC
3+
>TruSeq3_UniversalAdapter
4+
AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTA

0 commit comments

Comments
 (0)