Skip to content

Commit 90a098d

Browse files
Fix with BAM having GN:- tag for not genic reads
1 parent 380b619 commit 90a098d

File tree

1 file changed

+298
-0
lines changed

1 file changed

+298
-0
lines changed

SoloTE_1.02/SoloTE_pipeline.py

Lines changed: 298 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,298 @@
1+
#SoloTE v1
2+
3+
import pysam
4+
import sys
5+
import re
6+
import os
7+
import pandas as pd
8+
from datetime import datetime
9+
from multiprocessing import Pool
10+
import subprocess
11+
from pathlib import Path
12+
import shutil
13+
14+
starting_time = datetime.now()
15+
16+
starting_time_formatted = starting_time.strftime("%H:%M:%S")
17+
print("SoloTE started at "+ starting_time_formatted)
18+
19+
#validate PATH requirements
20+
required_software = ['samtools','bedtools']
21+
for software in required_software:
22+
if shutil.which(software) is None:
23+
print(software+" is not in your PATH environment variable. Add it to your path before running SoloTE")
24+
exit()
25+
else:
26+
print(software+" found!")
27+
28+
29+
def processPerChrom(chromosome,inputfile,outdir,outbasename):
30+
count = 0
31+
print(f'Process: {os.getpid()}')
32+
33+
countsname = outbasename+"_countpercell_"+chromosome+".counts"
34+
if os.path.exists(countsname):
35+
print(countsname+" exists. Skipping this file")
36+
return(countsname)
37+
38+
os.system("samtools view "+inputfile+" "+chromosome+"| awk 'BEGIN{FS=OFS=\"\t\"}{gn=\"\";gx=\"\";cb=\"\";ub=\"\";for(i=12;i<=NF;i++){if($i~ /CB:/){sub(\".*:\",\"\",$i);cb=$i};if($i~ /UB:/){sub(\".*:\",\"\",$i);ub=$i};if($i~ /GX:/){gx=$i};if($i~ /GN:/){sub(\"GN:Z:\",\"\",$i);gn=$i} }; if(cb!=\"-\" && ub != \"-\"){print gn,cb,ub}}'|sort -t'\t' -k1,1 -k2,2 -k3,3|bedtools groupby -g 1,2 -c 3 -o count_distinct > "+countsname)
39+
40+
linenumber = subprocess.run(["wc","-l",countsname],stdout=subprocess.PIPE,text=True)
41+
if re.match("^0",linenumber.stdout):
42+
print(linenumber.stdout)
43+
else:
44+
return(countsname)
45+
46+
47+
inputfile = sys.argv[1]
48+
headerlines = pysam.view("-H",inputfile).split("\n")
49+
chrnames = []
50+
for headerline in headerlines:
51+
# if "@SQ" in headerline and "chr" in headerline:
52+
if "@SQ" in headerline:
53+
split_sq = headerline.split("\t")
54+
chrname = re.sub("SN:","",split_sq[1])
55+
chrnames.append(chrname)
56+
print(chrnames)
57+
co_line = (list(filter(lambda x:'@CO' in x,headerlines)))
58+
co_line = co_line[0]
59+
print(co_line.split("--"))
60+
61+
62+
outsammultnmax_value = (list(filter(lambda x:'outSAMmultNmax 1' in x,co_line.split("--"))))
63+
#print(len(outsammultnmax_value))
64+
if len(outsammultnmax_value)!=1:
65+
print("outSAMmultNmax 1 option missing in BAM file")
66+
# exit()
67+
68+
##samtags_from_star = (list(filter(lambda x:'outSAMattributes' in x,co_line.split("--"))))
69+
##print(samtags_from_star[0])
70+
##
71+
##if "CB" in samtags_from_star[0] and "UB" in samtags_from_star[0]:
72+
## print("CB and UB tags present in BAM file")
73+
##else:
74+
## print("CB and UB tags not present in BAM file")
75+
## exit()
76+
77+
cpus = sys.argv[2]
78+
outbase = sys.argv[3]
79+
TE_bed = sys.argv[4]
80+
81+
outprefix = sys.argv[5]
82+
83+
result_list = []
84+
85+
outdir = outbase+"/"+outprefix+"_SoloTE_temp"
86+
87+
workingdir = os.getcwd()
88+
#finaldir = os.getcwd()+"/"+outbase+"_SoloTE_output"
89+
finaldir = outbase+"/"+outprefix+"_SoloTE_output"
90+
print(finaldir)
91+
print(outdir)
92+
print(outbase)
93+
print(outprefix)
94+
95+
outbase = outprefix
96+
97+
inputbam = os.path.abspath(inputfile)
98+
bam_without_genes=outbase+"_nogenes.bam"
99+
bam_without_genes_newtag=outbase+"_nogenes_newtag.bam"
100+
bam_without_genes_oldtag=outbase+"_nogenes_oldtag.bam"
101+
temp_bam="temp.bam"
102+
te_bam=outbase+"_nogenes_overlappingtes.bam"
103+
te_bamAsbed=outbase+"_nogenes_overlappingtes.bed"
104+
selected_TEs=outbase+"_selectedtes.bed"
105+
gene_bam=outbase+"_genes.bam"
106+
full_bam=outbase+"_full.bam"
107+
full_sortedbam=outbase+"_full_sorted.bam"
108+
final_bam=outbase+"_final.bam"
109+
annotated_te_bam=outbase+"_teannotated.bam"
110+
111+
SoloTE_Home=os.path.dirname(__file__)
112+
113+
Path(outdir).mkdir(parents=True, exist_ok=True)
114+
os.chdir(outdir)
115+
116+
if os.path.exists(bam_without_genes) and os.path.exists(gene_bam):
117+
print(bam_without_genes+" and "+gene_bam+" exist in output folder. Skipping this step")
118+
else:
119+
cmd="samtools view --threads "+cpus+" -d GN -U "+bam_without_genes_oldtag+" -O BAM -o "+temp_bam+" "+inputbam
120+
print(cmd)
121+
os.system(cmd)
122+
123+
cmd="samtools view -d GN:- -o "+bam_without_genes_newtag+" -U "+gene_bam+" -O BAM "+temp_bam
124+
print(cmd)
125+
os.system(cmd)
126+
127+
cmd="samtools cat --threads "+cpus+" -o "+bam_without_genes+" "+bam_without_genes_oldtag+" "+bam_without_genes_newtag
128+
print(cmd)
129+
os.system(cmd)
130+
131+
if os.path.exists(te_bam):
132+
print(te_bam+" exists in output folder. Skipping this step")
133+
else:
134+
cmd="samtools view --threads "+cpus+" -O BAM -o "+te_bam+" -L "+TE_bed+" "+bam_without_genes
135+
print(cmd)
136+
os.system(cmd)
137+
cmd="samtools index "+te_bam
138+
print(cmd)
139+
os.system(cmd)
140+
141+
if os.path.exists(te_bamAsbed):
142+
print(te_bamAsbed+" exists in output folder. Skipping this step")
143+
else:
144+
cmd="bedtools bamtobed -i "+te_bam+" -split > "+te_bamAsbed
145+
print(cmd)
146+
os.system(cmd)
147+
148+
if os.path.exists(selected_TEs):
149+
print(selected_TEs+" exists in output folder. Skipping this step")
150+
else:
151+
cmd="bedtools intersect -a "+TE_bed+" -b "+te_bamAsbed+" -u > "+selected_TEs
152+
print(cmd)
153+
os.system(cmd)
154+
155+
if os.path.exists(annotated_te_bam):
156+
print(annotated_te_bam+" exists in output folder. Skipping this step")
157+
else:
158+
annotateBAMpath=SoloTE_Home+"/annotateBAM.py"
159+
cmd="python "+annotateBAMpath+" "+te_bam+" "+selected_TEs+" "+annotated_te_bam
160+
print(cmd)
161+
os.system(cmd)
162+
163+
if os.path.exists(full_bam):
164+
print(full_bam+" exists in output folder. Skipping this step")
165+
else:
166+
cmd="samtools cat --threads "+cpus+" -o "+full_bam+" "+gene_bam+" "+annotated_te_bam
167+
print(cmd)
168+
os.system(cmd)
169+
170+
if os.path.exists(full_sortedbam):
171+
print(full_sortedbam+" exists in output folder. Skipping this step")
172+
else:
173+
cmd="samtools sort --threads "+cpus+" -O BAM -o "+full_sortedbam+" "+full_bam
174+
print(cmd)
175+
os.system(cmd)
176+
177+
if os.path.exists(final_bam):
178+
print(final_bam+" exists in output folder. Skipping this step")
179+
else:
180+
cmd="samtools view -U "+final_bam+" --tag CB:- -@ 8 "+full_sortedbam+" > /dev/null"
181+
print(cmd)
182+
os.system(cmd)
183+
cmd="samtools index "+final_bam
184+
print(cmd)
185+
os.system(cmd)
186+
187+
188+
pool = Pool(processes=int(cpus))
189+
#Generate counts
190+
for chromosome in chrnames:
191+
pool.apply_async(processPerChrom,args=(chromosome,final_bam,outdir,outbase),callback=result_list.append)
192+
193+
pool.close()
194+
pool.join()
195+
print(result_list)
196+
197+
allcountsfile = outbase+"_allcounts.txt"
198+
199+
if os.path.exists(allcountsfile):
200+
print(allcountsfile+" exists. Will be removed")
201+
os.remove(allcountsfile)
202+
203+
for countfile in result_list:
204+
if countfile is not None:
205+
os.system("cat "+countfile+" >> "+allcountsfile)
206+
207+
genes_tsv_filename = outbase +"_genes.tsv"
208+
barcodes_tsv_filename = outbase+"_barcodes.tsv"
209+
210+
genes_filename1 = outbase + "_genes.txt"
211+
telocus_filename1 = outbase + "_locustes.txt"
212+
tesubf_filename1 = outbase + "_subftes.txt"
213+
214+
os.system("grep -v \"SoloTE\" "+allcountsfile+" > "+genes_filename1)
215+
os.system("grep \"SoloTE\" "+allcountsfile+"|grep \"chr\" > "+telocus_filename1)
216+
os.system("grep \"SoloTE\" "+allcountsfile+"|grep -v \"chr\" > "+tesubf_filename1)
217+
218+
219+
new_outfiles = []
220+
221+
for filename in [genes_filename1,telocus_filename1,tesubf_filename1]:
222+
outfile = re.sub(".txt","_2.txt",filename)
223+
new_outfiles.append(outfile)
224+
os.system("sort -t'\t' -k1,1 -k2,2 -k3,3 "+filename+"|bedtools groupby -g 1,2 -c 3 -o sum > "+outfile)
225+
226+
final_countsfile = outbase + "_allcounts_final.txt"
227+
te_threshold = 3
228+
os.system("cat "+new_outfiles[0]+"> "+final_countsfile)
229+
os.system("cat "+new_outfiles[1]+" "+new_outfiles[2]+"|awk '($NF>="+str(te_threshold)+"){print $0}' >> "+final_countsfile)
230+
231+
os.system("awk '{print $1}' "+final_countsfile+"|sort -u|awk 'BEGIN{FS=OFS=\"\t\"}{print $1,$1}' > "+genes_tsv_filename)
232+
os.system("awk '{print $2}' "+final_countsfile+"|sort -u > "+barcodes_tsv_filename)
233+
234+
235+
genenumber = subprocess.run(["wc","-l",genes_tsv_filename],stdout=subprocess.PIPE,text=True)
236+
barcodenumber = subprocess.run(["wc","-l",barcodes_tsv_filename],stdout=subprocess.PIPE,text=True)
237+
allcounts_number = subprocess.run(["wc","-l",final_countsfile],stdout=subprocess.PIPE,text=True)
238+
marketmatrix_line3 = genenumber.stdout.split(" ")[0]+" "+barcodenumber.stdout.split(" ")[0]+" "+allcounts_number.stdout.split(" ")[0]
239+
240+
output_marketmatrix_filename = outbase+"_allcounts_matrix.mtx"
241+
os.system("echo \"%%MatrixMarket matrix coordinate integer general\n%\" > "+output_marketmatrix_filename)
242+
os.system("echo "+marketmatrix_line3+" >> "+output_marketmatrix_filename)
243+
generateMatrix_Rscript=SoloTE_Home+"/generate_mtx.R"
244+
os.system("Rscript "+generateMatrix_Rscript+" "+barcodes_tsv_filename+" "+genes_tsv_filename+" "+final_countsfile+" "+output_marketmatrix_filename)
245+
246+
print("Creating final results directory")
247+
Path(finaldir).mkdir(parents=True, exist_ok=True)
248+
print(finaldir+" was created")
249+
250+
os.rename(barcodes_tsv_filename,finaldir+"/barcodes.tsv")
251+
os.rename(genes_tsv_filename,finaldir+"/features.tsv")
252+
os.rename(output_marketmatrix_filename,finaldir+"/matrix.mtx")
253+
254+
##Generate TE statistics
255+
outstats_basename = outbase+"_SoloTE.stats"
256+
outstats_filename = workingdir+"/"+outstats_basename
257+
258+
input_te_file = re.sub(".txt","_2.txt",telocus_filename1)
259+
file_table = pd.read_table(input_te_file,header=None,sep="\t")
260+
tes_ls_n = str(len(file_table[0].unique()))
261+
tes_ls_celln = str(len(file_table[1].unique()))
262+
tes_ls_totalumicount = str(file_table[2].sum())
263+
264+
265+
threshold_table = file_table[file_table[2]>=te_threshold]
266+
tes_threshold_ls_n = str(len(threshold_table[0].unique()))
267+
tes_threshold_ls_celln = str(len(threshold_table[1].unique()))
268+
tes_threshold_ls_totalumicount = str(threshold_table[2].sum())
269+
270+
input_te_file = re.sub(".txt","_2.txt",tesubf_filename1)
271+
file_table = pd.read_table(input_te_file,header=None,sep="\t")
272+
tes_subf_n = str(len(file_table[0].unique()))
273+
tes_subf_celln = str(len(file_table[1].unique()))
274+
tes_subf_totalumicount = str(file_table[2].sum())
275+
276+
threshold_table = file_table[file_table[2]>=te_threshold]
277+
tes_threshold_subf_n = str(len(threshold_table[0].unique()))
278+
tes_threshold_subf_celln = str(len(threshold_table[1].unique()))
279+
tes_threshold_subf_totalumicount = str(threshold_table[2].sum())
280+
281+
print("Creating "+outstats_basename+" TE statistics file")
282+
outstats_filehandle = open(outstats_filename,"w")
283+
outstats_filehandle.write("Category\tTotal TEs\tTotal cells\tTotal UMIs\n")
284+
outstats_filehandle.write("Locus-specific TEs\t"+tes_ls_n+"\t"+tes_ls_celln+"\t"+tes_ls_totalumicount+"\n")
285+
outstats_filehandle.write("Locus-specific TEs (UMIs >= "+str(te_threshold)+")\t"+tes_threshold_ls_n+"\t"+tes_threshold_ls_celln+"\t"+tes_threshold_ls_totalumicount+"\n")
286+
outstats_filehandle.write("Subfamily-specific TEs\t"+tes_subf_n+"\t"+tes_subf_celln+"\t"+tes_subf_totalumicount+"\n")
287+
outstats_filehandle.write("Subfamily-specific TEs (UMIs >= "+str(te_threshold)+")\t"+tes_threshold_subf_n+"\t"+tes_threshold_subf_celln+"\t"+tes_threshold_subf_totalumicount+"\n")
288+
outstats_filehandle.close()
289+
print ("Finished creating "+outstats_basename)
290+
291+
print("SoloTE finished with "+inputfile)
292+
293+
finishing_time = datetime.now()
294+
elapsed_time = finishing_time - starting_time
295+
finishing_time_formatted = finishing_time.strftime("%H:%M:%S")
296+
print("SoloTE finished at "+finishing_time_formatted)
297+
print("SoloTE total running time: "+str(elapsed_time))
298+

0 commit comments

Comments
 (0)