|
| 1 | +#SoloTE v1 |
| 2 | + |
| 3 | +import pysam |
| 4 | +import sys |
| 5 | +import re |
| 6 | +import os |
| 7 | +import pandas as pd |
| 8 | +from datetime import datetime |
| 9 | +from multiprocessing import Pool |
| 10 | +import subprocess |
| 11 | +from pathlib import Path |
| 12 | +import shutil |
| 13 | + |
| 14 | +starting_time = datetime.now() |
| 15 | + |
| 16 | +starting_time_formatted = starting_time.strftime("%H:%M:%S") |
| 17 | +print("SoloTE started at "+ starting_time_formatted) |
| 18 | + |
| 19 | +#validate PATH requirements |
| 20 | +required_software = ['samtools','bedtools'] |
| 21 | +for software in required_software: |
| 22 | + if shutil.which(software) is None: |
| 23 | + print(software+" is not in your PATH environment variable. Add it to your path before running SoloTE") |
| 24 | + exit() |
| 25 | + else: |
| 26 | + print(software+" found!") |
| 27 | + |
| 28 | + |
| 29 | +def processPerChrom(chromosome,inputfile,outdir,outbasename): |
| 30 | + count = 0 |
| 31 | + print(f'Process: {os.getpid()}') |
| 32 | + |
| 33 | + countsname = outbasename+"_countpercell_"+chromosome+".counts" |
| 34 | + if os.path.exists(countsname): |
| 35 | + print(countsname+" exists. Skipping this file") |
| 36 | + return(countsname) |
| 37 | + |
| 38 | + os.system("samtools view "+inputfile+" "+chromosome+"| awk 'BEGIN{FS=OFS=\"\t\"}{gn=\"\";gx=\"\";cb=\"\";ub=\"\";for(i=12;i<=NF;i++){if($i~ /CB:/){sub(\".*:\",\"\",$i);cb=$i};if($i~ /UB:/){sub(\".*:\",\"\",$i);ub=$i};if($i~ /GX:/){gx=$i};if($i~ /GN:/){sub(\"GN:Z:\",\"\",$i);gn=$i} }; if(cb!=\"-\" && ub != \"-\"){print gn,cb,ub}}'|sort -t'\t' -k1,1 -k2,2 -k3,3|bedtools groupby -g 1,2 -c 3 -o count_distinct > "+countsname) |
| 39 | + |
| 40 | + linenumber = subprocess.run(["wc","-l",countsname],stdout=subprocess.PIPE,text=True) |
| 41 | + if re.match("^0",linenumber.stdout): |
| 42 | + print(linenumber.stdout) |
| 43 | + else: |
| 44 | + return(countsname) |
| 45 | + |
| 46 | + |
| 47 | +inputfile = sys.argv[1] |
| 48 | +headerlines = pysam.view("-H",inputfile).split("\n") |
| 49 | +chrnames = [] |
| 50 | +for headerline in headerlines: |
| 51 | +# if "@SQ" in headerline and "chr" in headerline: |
| 52 | + if "@SQ" in headerline: |
| 53 | + split_sq = headerline.split("\t") |
| 54 | + chrname = re.sub("SN:","",split_sq[1]) |
| 55 | + chrnames.append(chrname) |
| 56 | +print(chrnames) |
| 57 | +co_line = (list(filter(lambda x:'@CO' in x,headerlines))) |
| 58 | +co_line = co_line[0] |
| 59 | +print(co_line.split("--")) |
| 60 | + |
| 61 | + |
| 62 | +outsammultnmax_value = (list(filter(lambda x:'outSAMmultNmax 1' in x,co_line.split("--")))) |
| 63 | +#print(len(outsammultnmax_value)) |
| 64 | +if len(outsammultnmax_value)!=1: |
| 65 | + print("outSAMmultNmax 1 option missing in BAM file") |
| 66 | +# exit() |
| 67 | + |
| 68 | +##samtags_from_star = (list(filter(lambda x:'outSAMattributes' in x,co_line.split("--")))) |
| 69 | +##print(samtags_from_star[0]) |
| 70 | +## |
| 71 | +##if "CB" in samtags_from_star[0] and "UB" in samtags_from_star[0]: |
| 72 | +## print("CB and UB tags present in BAM file") |
| 73 | +##else: |
| 74 | +## print("CB and UB tags not present in BAM file") |
| 75 | +## exit() |
| 76 | + |
| 77 | +cpus = sys.argv[2] |
| 78 | +outbase = sys.argv[3] |
| 79 | +TE_bed = sys.argv[4] |
| 80 | + |
| 81 | +outprefix = sys.argv[5] |
| 82 | + |
| 83 | +result_list = [] |
| 84 | + |
| 85 | +outdir = outbase+"/"+outprefix+"_SoloTE_temp" |
| 86 | + |
| 87 | +workingdir = os.getcwd() |
| 88 | +#finaldir = os.getcwd()+"/"+outbase+"_SoloTE_output" |
| 89 | +finaldir = outbase+"/"+outprefix+"_SoloTE_output" |
| 90 | +print(finaldir) |
| 91 | +print(outdir) |
| 92 | +print(outbase) |
| 93 | +print(outprefix) |
| 94 | + |
| 95 | +outbase = outprefix |
| 96 | + |
| 97 | +inputbam = os.path.abspath(inputfile) |
| 98 | +bam_without_genes=outbase+"_nogenes.bam" |
| 99 | +bam_without_genes_newtag=outbase+"_nogenes_newtag.bam" |
| 100 | +bam_without_genes_oldtag=outbase+"_nogenes_oldtag.bam" |
| 101 | +temp_bam="temp.bam" |
| 102 | +te_bam=outbase+"_nogenes_overlappingtes.bam" |
| 103 | +te_bamAsbed=outbase+"_nogenes_overlappingtes.bed" |
| 104 | +selected_TEs=outbase+"_selectedtes.bed" |
| 105 | +gene_bam=outbase+"_genes.bam" |
| 106 | +full_bam=outbase+"_full.bam" |
| 107 | +full_sortedbam=outbase+"_full_sorted.bam" |
| 108 | +final_bam=outbase+"_final.bam" |
| 109 | +annotated_te_bam=outbase+"_teannotated.bam" |
| 110 | + |
| 111 | +SoloTE_Home=os.path.dirname(__file__) |
| 112 | + |
| 113 | +Path(outdir).mkdir(parents=True, exist_ok=True) |
| 114 | +os.chdir(outdir) |
| 115 | + |
| 116 | +if os.path.exists(bam_without_genes) and os.path.exists(gene_bam): |
| 117 | + print(bam_without_genes+" and "+gene_bam+" exist in output folder. Skipping this step") |
| 118 | +else: |
| 119 | + cmd="samtools view --threads "+cpus+" -d GN -U "+bam_without_genes_oldtag+" -O BAM -o "+temp_bam+" "+inputbam |
| 120 | + print(cmd) |
| 121 | + os.system(cmd) |
| 122 | + |
| 123 | + cmd="samtools view -d GN:- -o "+bam_without_genes_newtag+" -U "+gene_bam+" -O BAM "+temp_bam |
| 124 | + print(cmd) |
| 125 | + os.system(cmd) |
| 126 | + |
| 127 | + cmd="samtools cat --threads "+cpus+" -o "+bam_without_genes+" "+bam_without_genes_oldtag+" "+bam_without_genes_newtag |
| 128 | + print(cmd) |
| 129 | + os.system(cmd) |
| 130 | + |
| 131 | +if os.path.exists(te_bam): |
| 132 | + print(te_bam+" exists in output folder. Skipping this step") |
| 133 | +else: |
| 134 | + cmd="samtools view --threads "+cpus+" -O BAM -o "+te_bam+" -L "+TE_bed+" "+bam_without_genes |
| 135 | + print(cmd) |
| 136 | + os.system(cmd) |
| 137 | + cmd="samtools index "+te_bam |
| 138 | + print(cmd) |
| 139 | + os.system(cmd) |
| 140 | + |
| 141 | +if os.path.exists(te_bamAsbed): |
| 142 | + print(te_bamAsbed+" exists in output folder. Skipping this step") |
| 143 | +else: |
| 144 | + cmd="bedtools bamtobed -i "+te_bam+" -split > "+te_bamAsbed |
| 145 | + print(cmd) |
| 146 | + os.system(cmd) |
| 147 | + |
| 148 | +if os.path.exists(selected_TEs): |
| 149 | + print(selected_TEs+" exists in output folder. Skipping this step") |
| 150 | +else: |
| 151 | + cmd="bedtools intersect -a "+TE_bed+" -b "+te_bamAsbed+" -u > "+selected_TEs |
| 152 | + print(cmd) |
| 153 | + os.system(cmd) |
| 154 | + |
| 155 | +if os.path.exists(annotated_te_bam): |
| 156 | + print(annotated_te_bam+" exists in output folder. Skipping this step") |
| 157 | +else: |
| 158 | + annotateBAMpath=SoloTE_Home+"/annotateBAM.py" |
| 159 | + cmd="python "+annotateBAMpath+" "+te_bam+" "+selected_TEs+" "+annotated_te_bam |
| 160 | + print(cmd) |
| 161 | + os.system(cmd) |
| 162 | + |
| 163 | +if os.path.exists(full_bam): |
| 164 | + print(full_bam+" exists in output folder. Skipping this step") |
| 165 | +else: |
| 166 | + cmd="samtools cat --threads "+cpus+" -o "+full_bam+" "+gene_bam+" "+annotated_te_bam |
| 167 | + print(cmd) |
| 168 | + os.system(cmd) |
| 169 | + |
| 170 | +if os.path.exists(full_sortedbam): |
| 171 | + print(full_sortedbam+" exists in output folder. Skipping this step") |
| 172 | +else: |
| 173 | + cmd="samtools sort --threads "+cpus+" -O BAM -o "+full_sortedbam+" "+full_bam |
| 174 | + print(cmd) |
| 175 | + os.system(cmd) |
| 176 | + |
| 177 | +if os.path.exists(final_bam): |
| 178 | + print(final_bam+" exists in output folder. Skipping this step") |
| 179 | +else: |
| 180 | + cmd="samtools view -U "+final_bam+" --tag CB:- -@ 8 "+full_sortedbam+" > /dev/null" |
| 181 | + print(cmd) |
| 182 | + os.system(cmd) |
| 183 | + cmd="samtools index "+final_bam |
| 184 | + print(cmd) |
| 185 | + os.system(cmd) |
| 186 | + |
| 187 | + |
| 188 | +pool = Pool(processes=int(cpus)) |
| 189 | +#Generate counts |
| 190 | +for chromosome in chrnames: |
| 191 | + pool.apply_async(processPerChrom,args=(chromosome,final_bam,outdir,outbase),callback=result_list.append) |
| 192 | + |
| 193 | +pool.close() |
| 194 | +pool.join() |
| 195 | +print(result_list) |
| 196 | + |
| 197 | +allcountsfile = outbase+"_allcounts.txt" |
| 198 | + |
| 199 | +if os.path.exists(allcountsfile): |
| 200 | + print(allcountsfile+" exists. Will be removed") |
| 201 | + os.remove(allcountsfile) |
| 202 | + |
| 203 | +for countfile in result_list: |
| 204 | + if countfile is not None: |
| 205 | + os.system("cat "+countfile+" >> "+allcountsfile) |
| 206 | + |
| 207 | +genes_tsv_filename = outbase +"_genes.tsv" |
| 208 | +barcodes_tsv_filename = outbase+"_barcodes.tsv" |
| 209 | + |
| 210 | +genes_filename1 = outbase + "_genes.txt" |
| 211 | +telocus_filename1 = outbase + "_locustes.txt" |
| 212 | +tesubf_filename1 = outbase + "_subftes.txt" |
| 213 | + |
| 214 | +os.system("grep -v \"SoloTE\" "+allcountsfile+" > "+genes_filename1) |
| 215 | +os.system("grep \"SoloTE\" "+allcountsfile+"|grep \"chr\" > "+telocus_filename1) |
| 216 | +os.system("grep \"SoloTE\" "+allcountsfile+"|grep -v \"chr\" > "+tesubf_filename1) |
| 217 | + |
| 218 | + |
| 219 | +new_outfiles = [] |
| 220 | + |
| 221 | +for filename in [genes_filename1,telocus_filename1,tesubf_filename1]: |
| 222 | + outfile = re.sub(".txt","_2.txt",filename) |
| 223 | + new_outfiles.append(outfile) |
| 224 | + os.system("sort -t'\t' -k1,1 -k2,2 -k3,3 "+filename+"|bedtools groupby -g 1,2 -c 3 -o sum > "+outfile) |
| 225 | + |
| 226 | +final_countsfile = outbase + "_allcounts_final.txt" |
| 227 | +te_threshold = 3 |
| 228 | +os.system("cat "+new_outfiles[0]+"> "+final_countsfile) |
| 229 | +os.system("cat "+new_outfiles[1]+" "+new_outfiles[2]+"|awk '($NF>="+str(te_threshold)+"){print $0}' >> "+final_countsfile) |
| 230 | + |
| 231 | +os.system("awk '{print $1}' "+final_countsfile+"|sort -u|awk 'BEGIN{FS=OFS=\"\t\"}{print $1,$1}' > "+genes_tsv_filename) |
| 232 | +os.system("awk '{print $2}' "+final_countsfile+"|sort -u > "+barcodes_tsv_filename) |
| 233 | + |
| 234 | + |
| 235 | +genenumber = subprocess.run(["wc","-l",genes_tsv_filename],stdout=subprocess.PIPE,text=True) |
| 236 | +barcodenumber = subprocess.run(["wc","-l",barcodes_tsv_filename],stdout=subprocess.PIPE,text=True) |
| 237 | +allcounts_number = subprocess.run(["wc","-l",final_countsfile],stdout=subprocess.PIPE,text=True) |
| 238 | +marketmatrix_line3 = genenumber.stdout.split(" ")[0]+" "+barcodenumber.stdout.split(" ")[0]+" "+allcounts_number.stdout.split(" ")[0] |
| 239 | + |
| 240 | +output_marketmatrix_filename = outbase+"_allcounts_matrix.mtx" |
| 241 | +os.system("echo \"%%MatrixMarket matrix coordinate integer general\n%\" > "+output_marketmatrix_filename) |
| 242 | +os.system("echo "+marketmatrix_line3+" >> "+output_marketmatrix_filename) |
| 243 | +generateMatrix_Rscript=SoloTE_Home+"/generate_mtx.R" |
| 244 | +os.system("Rscript "+generateMatrix_Rscript+" "+barcodes_tsv_filename+" "+genes_tsv_filename+" "+final_countsfile+" "+output_marketmatrix_filename) |
| 245 | + |
| 246 | +print("Creating final results directory") |
| 247 | +Path(finaldir).mkdir(parents=True, exist_ok=True) |
| 248 | +print(finaldir+" was created") |
| 249 | + |
| 250 | +os.rename(barcodes_tsv_filename,finaldir+"/barcodes.tsv") |
| 251 | +os.rename(genes_tsv_filename,finaldir+"/features.tsv") |
| 252 | +os.rename(output_marketmatrix_filename,finaldir+"/matrix.mtx") |
| 253 | + |
| 254 | +##Generate TE statistics |
| 255 | +outstats_basename = outbase+"_SoloTE.stats" |
| 256 | +outstats_filename = workingdir+"/"+outstats_basename |
| 257 | + |
| 258 | +input_te_file = re.sub(".txt","_2.txt",telocus_filename1) |
| 259 | +file_table = pd.read_table(input_te_file,header=None,sep="\t") |
| 260 | +tes_ls_n = str(len(file_table[0].unique())) |
| 261 | +tes_ls_celln = str(len(file_table[1].unique())) |
| 262 | +tes_ls_totalumicount = str(file_table[2].sum()) |
| 263 | + |
| 264 | + |
| 265 | +threshold_table = file_table[file_table[2]>=te_threshold] |
| 266 | +tes_threshold_ls_n = str(len(threshold_table[0].unique())) |
| 267 | +tes_threshold_ls_celln = str(len(threshold_table[1].unique())) |
| 268 | +tes_threshold_ls_totalumicount = str(threshold_table[2].sum()) |
| 269 | + |
| 270 | +input_te_file = re.sub(".txt","_2.txt",tesubf_filename1) |
| 271 | +file_table = pd.read_table(input_te_file,header=None,sep="\t") |
| 272 | +tes_subf_n = str(len(file_table[0].unique())) |
| 273 | +tes_subf_celln = str(len(file_table[1].unique())) |
| 274 | +tes_subf_totalumicount = str(file_table[2].sum()) |
| 275 | + |
| 276 | +threshold_table = file_table[file_table[2]>=te_threshold] |
| 277 | +tes_threshold_subf_n = str(len(threshold_table[0].unique())) |
| 278 | +tes_threshold_subf_celln = str(len(threshold_table[1].unique())) |
| 279 | +tes_threshold_subf_totalumicount = str(threshold_table[2].sum()) |
| 280 | + |
| 281 | +print("Creating "+outstats_basename+" TE statistics file") |
| 282 | +outstats_filehandle = open(outstats_filename,"w") |
| 283 | +outstats_filehandle.write("Category\tTotal TEs\tTotal cells\tTotal UMIs\n") |
| 284 | +outstats_filehandle.write("Locus-specific TEs\t"+tes_ls_n+"\t"+tes_ls_celln+"\t"+tes_ls_totalumicount+"\n") |
| 285 | +outstats_filehandle.write("Locus-specific TEs (UMIs >= "+str(te_threshold)+")\t"+tes_threshold_ls_n+"\t"+tes_threshold_ls_celln+"\t"+tes_threshold_ls_totalumicount+"\n") |
| 286 | +outstats_filehandle.write("Subfamily-specific TEs\t"+tes_subf_n+"\t"+tes_subf_celln+"\t"+tes_subf_totalumicount+"\n") |
| 287 | +outstats_filehandle.write("Subfamily-specific TEs (UMIs >= "+str(te_threshold)+")\t"+tes_threshold_subf_n+"\t"+tes_threshold_subf_celln+"\t"+tes_threshold_subf_totalumicount+"\n") |
| 288 | +outstats_filehandle.close() |
| 289 | +print ("Finished creating "+outstats_basename) |
| 290 | + |
| 291 | +print("SoloTE finished with "+inputfile) |
| 292 | + |
| 293 | +finishing_time = datetime.now() |
| 294 | +elapsed_time = finishing_time - starting_time |
| 295 | +finishing_time_formatted = finishing_time.strftime("%H:%M:%S") |
| 296 | +print("SoloTE finished at "+finishing_time_formatted) |
| 297 | +print("SoloTE total running time: "+str(elapsed_time)) |
| 298 | + |
0 commit comments