abaumannii-pangenome/createSA.py at main · AbeelLab/abaumannii-pangenome · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
#!/usr/bin/env python3
import os, re, sys
import pandas as pd
import networkx as nx
if __name__ == '__main__':

	panaroodir = sys.argv[1]
	ptolemydir = sys.argv[2]

	# Load and process panaroo outputs
	panaroomap = os.path.join(panaroodir, 'gene_data.csv')
	panaroograph = os.path.join(panaroodir, 'final_graph.gml')
	centroid2Loc = pd.read_csv(panaroomap, index_col=2, header=0, \
        	                    names=['strain','location','clusterID','annot','protseq',\
                                'dnaseq','gene','desc']).drop(columns=['desc'])
	centroid2Loc.update(centroid2Loc.strain.apply(lambda x: x.replace('_reformatted','')))
	g = nx.read_gml(panaroograph, label='id')
	g = nx.relabel_nodes(g,int)
 	panaroodf = pd.DataFrame([v for k,v in g.nodes.items()], index=g.nodes())
	panaroodf.loc[:,'locustag'] = panaroodf.apply(lambda x: \
                                            centroid2Loc.loc[x.seqIDs].annot \
                                            if isinstance(x.seqIDs,str) \
                                            else centroid2Loc.loc[x.seqIDs].annot.values, axis=1)

	# Load and process ptolemy outputs
 	ptolemymap = os.path.join(ptolemydir, 'orf2id_mapping.txt')
 	safile = os.path.join(ptolemydir, 'syntenic_anchors.txt')

 	id2orf = pd.read_table(ptolemymap, delimiter='\t', header=None, index_col=2, \
			       names=['orf', 'strain', 'id'])
 	id2orf['locustag'] = id2orf.apply(lambda x: re.sub('^'+x.strain+'_', '', x.orf), axis=1)
 	id2orf['locustag'].update(id2orf.locustag.apply(lambda x: '_'.join(x.split('_')[1:-2]) \
							if len(x.split('_'))<8 else \
							'_'.join(x.split('_')[2:-2])))


 	# Match the panaroo outputs to ptolemys ORF indices
 	keeplocus = set(sum(panaroodf.locustag.apply(lambda x: [x] if isinstance(x,str) \
						     else list(x)),[]))
 	keeplocus = set(id2orf.locustag.values).intersection(keeplocus)
 	df = id2orf[id2orf.apply(lambda x: x.locustag in keeplocus, axis=1)].dropna()
 	df['newid'] = df.index # df with common ORFs, renumbered (0-based)
 	df.to_csv(ptolemymap, sep='\t', columns=['orf','strain','newid'], header=False, index=False)
 	id2orf = df.set_index('locustag')

 	# Remove ORFs discarded by panaroo from all ptolemy index files in the database
 	keepID = df.index
 	df = pd.read_table(os.path.join(ptolemydir,'id2fasta.txt'), delimiter='\t', header=None, \
			   index_col=0, names=['id','seq']).loc[keepIDs]
 	df.to_csv(os.path.join(ptolemydir,'id2fasta.txt'), sep='\t', header=False, index=True)
 	df = pd.read_table(os.path.join(ptolemydir,'global_z.txt'), delimiter='\t', header=None, \
			   index_col=0).apply(lambda x: set(map(int,x[1].split(','))), axis=1)
 	df = df.apply(lambda x: ','.join(map(str,x.intersection(keepID))))
 	df = df.drop(labels=df[df.apply(lambda x: not(x))].index)
 	df.to_csv(os.path.join(ptolemydir,'global_z.txt'), sep='\t', header=False, index=True)
 	df = pd.read_table(os.path.join(ptolemydir,'global_z_prime.txt'), delimiter='\t', header=None, \
			   index_col=0)
 	df.loc[keepID].dropna()[1].to_csv(os.path.join(ptolemydir,'global_z_prime.txt'), \
					  sep='\t', header=False, index=True)

 	# Writing out the syntenic anchor file
 	df = panaroodf.apply(lambda x: id2orf.loc[set(x.locustag).intersection(keeplocus),'newid'].values,axis=1)
 	anchors = df.apply(lambda x: ''.join(['\t'.join([str(mem), \
			','.join(map(str,set(x).difference([mem])))+'\n']) \
			for mem in x if len(x)>1]))
 	anchors[anchors.apply(lambda x: len(x)==0)] = np.nan
 	anchors.dropna().to_csv(safile, sep='\t', header=False, index=False, na_rep=None)