|
| 1 | +import pysam |
| 2 | +import sys |
| 3 | + |
| 4 | +inputfile = sys.argv[1] |
| 5 | +samfile = pysam.AlignmentFile(inputfile,"rb") |
| 6 | + |
| 7 | +te_bed_file = sys.argv[2] |
| 8 | +te_bed_iterator = pysam.tabix_iterator(open(te_bed_file,"r"),pysam.asBed()) |
| 9 | + |
| 10 | +outfilename = sys.argv[3] |
| 11 | +outfile = pysam.AlignmentFile(outfilename, "wb",template=samfile) |
| 12 | + |
| 13 | +for te in te_bed_iterator: |
| 14 | + te_sequence=te[0] |
| 15 | + te_start=int(te[1]) |
| 16 | + te_end=int(te[2]) |
| 17 | + te_locusname=te[3] |
| 18 | + te_name = (te_locusname.split("|"))[3] |
| 19 | + sam_iterator=samfile.fetch(te_sequence,te_start,te_end) |
| 20 | + for sam_record in sam_iterator: |
| 21 | + if "N" in sam_record.cigarstring: |
| 22 | + length=sam_record.reference_end-sam_record.reference_start+1 |
| 23 | + cigar_start = sam_record.reference_start |
| 24 | + |
| 25 | + for cigartuple in sam_record.cigartuples: |
| 26 | + cigar_op = int(cigartuple[0]) |
| 27 | + cigar_op_length = cigartuple[1] |
| 28 | + cigar_end = cigar_start+cigar_op_length |
| 29 | + |
| 30 | + #if cigar op = M, check overlap with TE |
| 31 | + if cigar_op==0 : |
| 32 | + if te_end >= cigar_start and te_start <= cigar_end: |
| 33 | + sam_record.set_tag("GX",te_locusname) |
| 34 | + if sam_record.mapping_quality == 255: |
| 35 | + sam_record.set_tag("GN","SoloTE|"+te_locusname) |
| 36 | + else: |
| 37 | + sam_record.set_tag("GN","SoloTE|"+te_name) |
| 38 | + outfile.write(sam_record) |
| 39 | + |
| 40 | + cigar_start = cigar_end |
| 41 | + else: |
| 42 | + cigar_start = sam_record.reference_start |
| 43 | + cigar_end = sam_record.reference_end |
| 44 | + if te_end >= cigar_start and te_start <= cigar_end: |
| 45 | + sam_record.set_tag("GX",te_locusname) |
| 46 | + if sam_record.mapping_quality == 255: |
| 47 | + sam_record.set_tag("GN","SoloTE|"+te_locusname) |
| 48 | + else: |
| 49 | + sam_record.set_tag("GN","SoloTE|"+te_name) |
| 50 | + outfile.write(sam_record) |
| 51 | + |
| 52 | + |
0 commit comments