forked from BenjaminGuinet/HGT-finder-between-2-species
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathget_dataframe_candidates.py
100 lines (71 loc) · 4.43 KB
/
get_dataframe_candidates.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import pandas as pd
import itertools
from itertools import *
from Bio import SeqIO
import numpy as np
import argparse
import sys
# This script allows you to merge you two blasy hit files with taxonomic informations with information about distances, GC, coverage etc.
parser = argparse.ArgumentParser(description='Merge and get new dataframe of genes candidates')
parser.add_argument("-c", "--candidates_sp1_sp2", help="introduce de dataframe with informations about dn_ds, coverage etc (output from gff_cov_GC.py)")
parser.add_argument("-b1", "--blast_sp1", help="introduce the blast file with all best hits and taxid inf of the specie1")
parser.add_argument("-b2", "--blast_sp2", help="introduce the blast file with all best hits and taxid ing of the specie2")
parser.add_argument("-s1", "--specie1", help="introduce the name of the specie 1")
parser.add_argument("-s2", "--specie2", help="introduce the name of the specie 2")
args = parser.parse_args()
candidates_sp1_sp2=args.candidates_sp1_sp2
blast_sp1=args.blast_sp1
blast_sp2=args.blast_sp2
specie1=args.specie1 #espèce 1
specie2=args.specie2 #espèce 2
#Ex Usage:
#python3 get_dataframe_candidates.py -c candidates_genes_pvalue_0035_0042.tab -b1 outfile_0035.tab_top_blast_hits.csv -b2 outfile_0042.tab_top_blast_hits.csv -s1 0035 -s2 0042
augustus_cluster=pd.read_csv(candidates_sp1_sp2,header=0,sep='\t')
blast_sp1=pd.read_table(blast_sp1,header=0,sep=',')
blast_sp2=pd.read_table(blast_sp2,header=0,sep=',')
data = augustus_cluster.merge(blast_sp1, left_on='seq1_id', right_on='qseqid')
data = data.merge(blast_sp2, left_on='seq2_id', right_on='qseqid')
Complet_data=data.drop(columns=["Unnamed: 0", "Mean_length", 'GC_content_seq1','GC_content_seq2','length_x','mismatch_x','gapopen_x','qstart_x','qend_x','sstart_x','send_x',
'bitscore_x','length_y','mismatch_y','gapopen_y','qstart_y','qend_y','sstart_y','send_y','bitscore_y'])
Complet_data.columns = ['seq1_id','seq2_id','dN','dS','Dist_third_pos','Dist_brute',
'Length_seq_1','Length_seq_2','scaf_name_seq1','GC_scaff_seq1','cov_depth_seq1','scaf_name_seq2','GC_scaff_seq2','cov_depth_seq2',
'qseqid_sp1','sseqid_sp1','pident_sp1','evalue_sp1','salltitles_sp1','staxids_sp1','scientific_name_sp1','scomnames_sp1','sskingdoms_sp1','Order_sp1','qseqid_sp2',
'sseqid_sp2','pident_sp2','evalue_sp2','salltitles_sp2','staxids_sp2','scientific_name_sp2','scomnames_sp2','sskingdoms_sp2','Order_sp2']
#Creat a new dataframe with all informations of these candidates paired sequences:
#Complet_data.to_csv("Complet_candidates_tax.tab",sep='\t')
Summarize_data=Complet_data.drop(columns=['dN','Dist_third_pos','Dist_brute','Length_seq_1','Length_seq_2','scaf_name_seq1','GC_scaff_seq1','cov_depth_seq1','scaf_name_seq2','GC_scaff_seq2','cov_depth_seq2',
'qseqid_sp1','pident_sp1','staxids_sp1','staxids_sp2','scomnames_sp1','scomnames_sp2','qseqid_sp2','pident_sp2','sskingdoms_sp1','sskingdoms_sp2'])
#Creat a new dataframe with a summary of principals informations of the candidates paired sequences:
#Summarize_data.to_csv("Summarize_candidates_tax.tab",sep='\t')
#This part allows you to count how many candidats genes have been found depending on the trainings parameters:
count_sp1_sp1=0
count_sp1_sp2=0
count_sp2_sp2=0
count_sp2_sp1=0
count=0
for i in Summarize_data['seq1_id']:
count+=1
if '_'+specie1+'_'+specie1 in i:
count_sp1_sp1 +=1
if '_'+specie1+'_'+specie2 in i:
count_sp1_sp2 +=1
for i in Summarize_data['seq2_id']:
if '_'+specie2+'_'+specie2 in i:
count_sp2_sp2 +=1
if '_'+specie2+'_'+specie1 in i:
count_sp2_sp1 +=1
candidates=pd.read_csv(candidates_sp1_sp2,header=0,sep='\t')
All_candidates = candidates.merge(Complet_data, how='left')
All_candidates.to_csv("All_candidates_with_all_inf.tab",sep='\t')
Summarize_all_candidates = candidates.merge(Summarize_data, how='left')
Summarize_all_candidates.to_csv("Summarize_All_candidates_with_all_inf.tab",sep='\t')
print("Nb_",specie1,"_",specie1," :",count_sp1_sp1,sep="")
print("Nb_",specie1,"_",specie2," :",count_sp1_sp2,sep="")
print("Nb_",specie2,"_",specie2," :",count_sp2_sp2,sep="")
print("Nb_",specie2,"_",specie1," :",count_sp2_sp1,sep="")
print('Nb total :',count_sp1_sp1+count_sp1_sp2+count_sp2_sp2+count_sp2_sp1)
candidates_sp2 = pd.DataFrame(augustus_cluster['seq2_id'])
print(candidates_sp2)
#Allows to only keep sequences after the coverage filter:
result=All_candidates.merge(candidates_sp2, left_on='seq2_id', right_on='seq2_id')
result.to_csv("All_candidates_cov_all_inf.tab",sep='\t')