forked from BenjaminGuinet/HGT-finder-between-2-species
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcluster_silix_merging.py
164 lines (108 loc) · 5.55 KB
/
cluster_silix_merging.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
import pandas as pd
import itertools
from itertools import *
from Bio import SeqIO
import numpy as np
import sys
import argparse
#Script wich allows to deal with Diamond output (.m8 format from silix), Cluster information (.fnodes from silix) and pident mean by Hsp (.net format from silix)
#The next commande lines will only keep cluster with not self-matchs sequences and the best pairs (max pident) within these clusters.
# Print out a message when the program is initiated.
#print('----------------------------------------------------------------\n')
#print(' Cluster-Silix merging 1.0.\n')
#print('----------------------------------------------------------------\n')
#----------------------------------- PARSE SOME ARGUMENTS ----------------------------------------
parser = argparse.ArgumentParser(description='Merge en keep clusters with the best pident')
parser.add_argument("-b", "--blast", help="introduce the blast output (.m8)")
parser.add_argument("-c", "--cluster", help="introduce the cluster file (.fnodes)")
parser.add_argument("-d", "--distance", help="introduce the distance file (.net)")
parser.add_argument("-f1", "--fasta_aa", help="introduce the fasta file with augustus predicted genes (concatened amino acid)")
parser.add_argument("-f2", "--fasta_dna", help="introduce the fasta file with augustus predicted genes (concatened dna)")
args = parser.parse_args()
# Variable that stores fasta sequences
blast=args.blast
cluster=args.cluster
distance=args.distance
fasta_aa=args.fasta_aa
fasta_dna=args.fasta_dna
#Usage xemple:
#python3 cluster_silix_merging.py -b matches_Augustus_0035_0042.m8 -c cluster_Augustus_0035_0042.fnodes -d matches_Augustus_0035_0042.net -f1 concatenate_0035_0042.faa -f2 concatenate_0035_0042.fna
blast=pd.read_table(blast,header=None)
blast.columns = ["qseqid", "sseqid", "pident", "length", "mismatch", "gapopen","qstart", "qend", "sstart", "send", "evalue", "bitscore"]
blast=blast.drop(columns=["mismatch", "gapopen", "evalue", "bitscore"])
#Cluster Dataframe
cluster=pd.read_table(cluster,header=None)
cluster.columns = ["cluster_name", "seq_names"]
#Distance mean dataframe
dist=pd.read_table(distance,header=None)
dist.columns = ["qseqid", "sseqid","pident","coverage"]
dist=dist.drop(columns=["coverage"])
#Including cluster information and distance mean information into one dataframe:
data = cluster.merge(dist, left_on='seq_names', right_on='qseqid')
#Adding for each two remaining dataframe a concatened colomn
data["name_concatened"] = data["qseqid"].map(str) + data["sseqid"]
blast["name_concatened"] = blast["qseqid"].map(str) + blast["sseqid"]
#We do not need these columns anymore
blast=blast.drop(columns=[ "qseqid","sseqid"])
#Including cluster information + distance mean information + coordinate sequences from blast into one dataframe:
data = data.merge(blast, left_on='name_concatened', right_on='name_concatened')
data=data.drop(columns=["seq_names","name_concatened"])
data.to_csv("dataframe_brute.txt", sep='\t')
#data should only contain pairings of different sequences:
data = data[data['qseqid']!=data["sseqid"]]
#To ignore pairings which have the same substrings in their seqid, the most readable way would be to add data columns with these data:
data['qspec'] = [seqid.split('_')[1] for seqid in data['qseqid'].values]
data['sspec'] = [seqid.split('_')[1] for seqid in data['sseqid'].values]
data_wo_eqSpec = data[data['qspec']!=data['sspec']]
data_wo_eqSpec.to_csv("dataframe.txt", sep='\t')
df=pd.read_table("dataframe.txt",header=0,sep='\t')
#Sorte sequences
df[['qseqid','sseqid']] = np.sort(df[['qseqid','sseqid']], axis=1)
#droping duplicates
data_wo_eqSpec = df.drop_duplicates(subset=['qseqid','sseqid'])
#In the end the data should be grouped by cluster-ID and within each group, the maximum of pident is of interest:
data_grpd = data_wo_eqSpec.groupby(['cluster_name'])
data_wo_eqSpec.to_csv("dataframe_not_max.txt",sep='\t')
result=data_wo_eqSpec.loc[data_grpd['pident_x'].idxmax()]
result.to_csv("dataframe_max.txt",sep='\t')
df=pd.read_table("dataframe_max.txt",sep="\t")
output_aa_file = open('clusters1_aa.fasta','w')
output_aa_file2 = open('clusters2_aa.fasta','w')
output_dna_file = open('clusters1_dna.fasta','w')
output_dna_file2 = open('custers2_dna.fasta','w')
#Display into 4 files the dna and amino acid sequences:
#record_dict = SeqIO.to_dict(SeqIO.parse("concatenate_with_busco_names_0035_0042_aa.fa", "fasta"))
qseqid=df.ix[:,3]
sseqid=df.ix[:,4]
record_dict = SeqIO.to_dict(SeqIO.parse(fasta_aa, "fasta"))
for a, b in zip(qseqid,sseqid):
if a in record_dict:
print(">",record_dict[a].id,file=output_aa_file,sep="")
print(record_dict[a].seq,file=output_aa_file)
if a not in record_dict:
print(a ,"pas dans dic a_aa",b)
if b in record_dict:
print(">",record_dict[b].id,file=output_aa_file2,sep="")
print(record_dict[b].seq,file=output_aa_file2)
if b not in record_dict:
print(b ,"pas dans dic b_aa",a)
#Fasta file with all the sequences and their ID
record_dict2 = SeqIO.to_dict(SeqIO.parse(fasta_dna, "fasta"))
qseqid=df.ix[:,3]
sseqid=df.ix[:,4]
for a, b, in zip(qseqid,sseqid):
if a in record_dict2:
print(">",record_dict2[a].id,file=output_dna_file)
print(record_dict2[a].seq,file=output_dna_file)
if a not in record_dict2:
print(a ,"pas dans dic a_dna",b)
if b in record_dict2:
print(">",record_dict2[b].id,file=output_dna_file2)
print(record_dict2[b].seq,file=output_dna_file2)
if b not in record_dict2:
print(b," pas dans dic b_dna",a)
output_aa_file.close()
output_aa_file.close()
output_aa_file2.close()
output_dna_file.close()
output_dna_file2.close()