Skip to content

Commit a48eb4b

Browse files
Fix with BAM having GN:- tag for not genic reads
1 parent a6430ac commit a48eb4b

File tree

3 files changed

+74
-0
lines changed

3 files changed

+74
-0
lines changed

SoloTE_1.02/annotateBAM.py

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
import pysam
2+
import sys
3+
4+
inputfile = sys.argv[1]
5+
samfile = pysam.AlignmentFile(inputfile,"rb")
6+
7+
te_bed_file = sys.argv[2]
8+
te_bed_iterator = pysam.tabix_iterator(open(te_bed_file,"r"),pysam.asBed())
9+
10+
outfilename = sys.argv[3]
11+
outfile = pysam.AlignmentFile(outfilename, "wb",template=samfile)
12+
13+
for te in te_bed_iterator:
14+
te_sequence=te[0]
15+
te_start=int(te[1])
16+
te_end=int(te[2])
17+
te_locusname=te[3]
18+
te_name = (te_locusname.split("|"))[3]
19+
sam_iterator=samfile.fetch(te_sequence,te_start,te_end)
20+
for sam_record in sam_iterator:
21+
if "N" in sam_record.cigarstring:
22+
length=sam_record.reference_end-sam_record.reference_start+1
23+
cigar_start = sam_record.reference_start
24+
25+
for cigartuple in sam_record.cigartuples:
26+
cigar_op = int(cigartuple[0])
27+
cigar_op_length = cigartuple[1]
28+
cigar_end = cigar_start+cigar_op_length
29+
30+
#if cigar op = M, check overlap with TE
31+
if cigar_op==0 :
32+
if te_end >= cigar_start and te_start <= cigar_end:
33+
sam_record.set_tag("GX",te_locusname)
34+
if sam_record.mapping_quality == 255:
35+
sam_record.set_tag("GN","SoloTE|"+te_locusname)
36+
else:
37+
sam_record.set_tag("GN","SoloTE|"+te_name)
38+
outfile.write(sam_record)
39+
40+
cigar_start = cigar_end
41+
else:
42+
cigar_start = sam_record.reference_start
43+
cigar_end = sam_record.reference_end
44+
if te_end >= cigar_start and te_start <= cigar_end:
45+
sam_record.set_tag("GX",te_locusname)
46+
if sam_record.mapping_quality == 255:
47+
sam_record.set_tag("GN","SoloTE|"+te_locusname)
48+
else:
49+
sam_record.set_tag("GN","SoloTE|"+te_name)
50+
outfile.write(sam_record)
51+
52+
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
#!/bin/bash
2+
3+
RepeatMaskerOutfile=$1
4+
outputfile=$2
5+
6+
rm $outputfile
7+
8+
awk 'BEGIN{OFS="\t"}(NR>=4){strand="+"; if($9=="C"){ strand="-" }; split($11,arr,"/");id=arr[2]":"arr[1];id=$5"|"$6"|"$7"|"$10":"id"|"strand; if(arr[1] ~ /DNA|LINE|LTR|RC|SINE/) { print $5,$6,$7,id,$1,strand } }' $RepeatMaskerOutfile > $outputfile

SoloTE_1.02/generate_mtx.R

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
args <- commandArgs(trailingOnly=TRUE)
2+
3+
barcodes_filename <- args[1]
4+
genes_filename <- args[2]
5+
counts_filename <- args[3]
6+
7+
outname <- args[4]
8+
9+
barcodes <- read.delim(barcodes_filename,header=FALSE)
10+
genes <- read.delim(genes_filename,header=FALSE)
11+
counts <- read.delim(counts_filename,header=FALSE,sep="\t")
12+
counts$matchgene <- match(counts$V1,genes$V1)
13+
counts$matchbarcode <- match(counts$V2,barcodes$V1)
14+
write.table(file=outname,counts[,c(4,5,3)],sep=" ",quote=FALSE,row.names=FALSE,col.names=FALSE,append=TRUE)

0 commit comments

Comments
 (0)