|
| 1 | +#!/usr/bin/env python3 |
| 2 | +import argparse |
| 3 | +import gzip |
| 4 | +import re |
| 5 | +import sys |
| 6 | +import pysam |
| 7 | +from bisect import bisect_left, bisect_right |
| 8 | + |
| 9 | + |
def get_feature(line, feature):
    """Return the value of attribute *feature* from one GTF record.

    The ninth tab-separated column of ``line`` is read as a list of
    ``key value`` pairs separated by ``;``. Double quotes are removed
    before parsing.

    Args:
        line: A single GTF record (tab-separated text).
        feature: Attribute key to look up, e.g. ``gene_id``.

    Returns:
        The attribute value as a string, or ``None`` when the record has
        fewer than nine columns or does not carry the attribute. When a key
        occurs more than once, the last occurrence wins.
    """
    columns = line.strip().split('\t')
    if len(columns) < 9:
        return None

    attributes = columns[8].replace('"', '').strip()
    value = None
    for chunk in attributes.split(';'):
        # Split on the first run of whitespace only, so values that contain
        # spaces (e.g. gene_name "a b") are kept whole.
        pieces = chunk.split(None, 1)
        # Empty chunks (trailing ';') and keys without a value are ignored.
        if len(pieces) == 2 and pieces[0] == feature:
            value = pieces[1].strip()
    return value
| 17 | + |
| 18 | + |
# Upper bound used to close the last exon-free gap on every contig; it only
# has to be larger than any real reference coordinate.
_CONTIG_END_SENTINEL = 30000000000


def _load_gtf_intervals(gtf_path):
    """Read exon and gene intervals, grouped by contig, from a GTF file.

    Args:
        gtf_path: Path to the GTF; opened with gzip when it ends in ``.gz``.

    Returns:
        ``(exons, gene_locs)`` where ``exons[contig]`` is a list of
        ``(start, end)`` tuples and ``gene_locs[contig]`` is a list of
        ``(start, end, gene_id)`` tuples, both sorted by start.
    """
    exons = {}
    gene_locs = {}
    seen_exons = set()

    opener = gzip.open if gtf_path.endswith(".gz") else open
    with opener(gtf_path, "rt") as input_file:
        for line in input_file:
            # Skip header/comment lines and blank lines.
            if line.startswith("#") or not line.strip():
                continue
            fields = [x.strip() for x in line.strip().split('\t')]

            if fields[2] == 'exon':
                exon_id = get_feature(line.strip(), 'exon_id')
                contig_id = fields[0]
                locpair = (int(fields[3]), int(fields[4]))
                exons.setdefault(contig_id, [])
                # The same exon is listed once per transcript; record it once.
                # Records without an exon_id fall back to their coordinates
                # so they are not all collapsed onto a single "None" key.
                if exon_id is not None:
                    exon_key = exon_id
                else:
                    exon_key = (contig_id,) + locpair
                if exon_key not in seen_exons:
                    exons[contig_id].append(locpair)
                    seen_exons.add(exon_key)
            elif fields[2] == 'gene':
                gene_id = get_feature(line.strip(), 'gene_id')
                contig_id = fields[0]
                locpair = (int(fields[3]), int(fields[4]), gene_id)
                if gene_id is not None:
                    gene_locs.setdefault(contig_id, []).append(locpair)

    for contig_genes in gene_locs.values():
        contig_genes.sort(key=lambda x: x[0])
    for contig_exons in exons.values():
        contig_exons.sort(key=lambda x: x[0])

    return exons, gene_locs


def _intron_candidates(exons):
    """Return, per contig, the gaps not covered by any exon.

    Args:
        exons: ``{contig: [(start, end), ...]}`` with each list sorted by start.

    Returns:
        ``{contig: [(gap_start, gap_end), ...]}`` in ascending order. The
        first gap starts at 0 and the last one ends at
        ``_CONTIG_END_SENTINEL``. Gene boundaries are not considered here.
    """
    intron_cands = {}
    for contig_id, contig_exons in exons.items():
        gaps = []
        last_exon_end = 0
        for exon_start, exon_end in contig_exons:
            # A gap exists only when this exon starts beyond everything seen.
            if exon_start > last_exon_end:
                gaps.append((last_exon_end, exon_start))
            # Overlapping/nested exons: keep the right-most end.
            last_exon_end = max(last_exon_end, exon_end)
        # Everything after the last exon.
        gaps.append((last_exon_end, _CONTIG_END_SENTINEL))
        intron_cands[contig_id] = gaps
    return intron_cands


def _junction_points(gene_locs, intron_cands):
    """Return, per contig, the sorted gap boundaries that fall inside genes.

    The gap boundaries of a contig are flattened into one ascending list
    (even index = gap start, odd index = gap end). For every gene, the
    boundaries lying between the gene start and the gene end are collected.

    Args:
        gene_locs: ``{contig: [(start, end, gene_id), ...]}``.
        intron_cands: ``{contig: [(gap_start, gap_end), ...]}``.

    Returns:
        ``{contig: [point, ...]}`` with each list sorted and free of
        duplicates, so that it can be searched with ``bisect``.
    """
    introns = {}
    for contig_id, contig_genes in gene_locs.items():
        intronic_points = []
        # A contig may carry gene records without any exon records.
        for coor in intron_cands.get(contig_id, []):
            intronic_points.append(coor[0])
            intronic_points.append(coor[1])

        points = []
        for gene_start, gene_end, _gene_id in contig_genes:
            i = bisect_right(intronic_points, gene_start)
            j = bisect_left(intronic_points, gene_end)

            # NOTE(review): if the gene starts inside a gap (i is odd) the
            # pairs taken here span exons rather than gaps; only the
            # individual boundary points are used downstream.
            for k in range(i, j, 2):
                points.append(intronic_points[k])
                points.append(intronic_points[k + 1])

            # The gene ends inside a gap: record that boundary and the
            # gene end itself.
            if j % 2 == 1:
                points.append(intronic_points[j])
                points.append(gene_end)

        # Overlapping genes contribute the same boundaries repeatedly and
        # out of order; bisect needs a sorted list, and a boundary must be
        # counted once per read.
        introns[contig_id] = sorted(set(points))
    return introns


def main():
    """Write the alignments that do not cross exactly one junction point.

    Exon and gene intervals are read from ``--input-gtf``. For every contig
    the boundaries of the exon-free gaps inside genes are collected, and
    each alignment from ``--input-bam`` is copied to ``--output-bam`` unless
    exactly one such boundary lies between its reference start and end.
    Alignments on contigs without gene records, and alignments without a
    reference end, are not written.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--input-gtf", "-g", dest="input_gtf", required=True, help="input GTF"
    )
    parser.add_argument(
        "--input-bam", "-i", dest="input_bam", required=True, help="input BAM"
    )
    parser.add_argument(
        "--output-bam", "-o", dest="output_bam", required=True, help="output BAM without intron-exon junctions"
    )
    args = parser.parse_args()

    exons, gene_locs = _load_gtf_intervals(args.input_gtf)
    intron_cands = _intron_candidates(exons)
    introns = _junction_points(gene_locs, intron_cands)

    # NOTE(review): GTF coordinates are 1-based inclusive while pysam
    # reference_start/reference_end are 0-based half-open; the boundaries
    # are compared without conversion - confirm this is intended.
    with pysam.AlignmentFile(args.input_bam, "rb", check_sq=False) as input_alignments:
        with pysam.AlignmentFile(args.output_bam, "wb", template=input_alignments) as outbam:
            for a in input_alignments:
                junctions = introns.get(a.reference_name)
                # reference_end is None for reads without an alignment;
                # they cannot be placed relative to any boundary.
                if junctions is None or a.reference_end is None:
                    continue
                i = bisect_left(junctions, a.reference_start)
                j = bisect_left(junctions, a.reference_end)
                # A read crossing exactly one boundary spans an intron-exon
                # junction (unspliced pre-mRNA) and is left out. A spliced
                # read aligned to the genome has a gap over the intron and
                # therefore crosses two or more boundaries; a read crossing
                # none lies within a single segment. Both are kept.
                if j - i != 1:
                    outbam.write(a)
| 141 | + |
# Run the filter only when this file is executed as a script, so importing
# the module does not parse arguments or touch any files.
if __name__ == "__main__":
    main()
0 commit comments