|
| 1 | +#!/usr/bin/env python3 |
| 2 | +import argparse |
| 3 | +import gzip |
| 4 | +import re |
| 5 | +from bisect import bisect_left, bisect_right |
| 6 | + |
| 7 | + |
| 8 | +def get_feature(line, feature): |
| 9 | + features = re.sub('"', "", line.strip().split("\t")[8].strip()) |
| 10 | + features_dic = {x.split()[0]: x.split()[1] for x in features.split(";") if x} |
| 11 | + |
| 12 | + if feature in features_dic: |
| 13 | + return features_dic[feature] |
| 14 | + return None |
| 15 | + |
| 16 | + |
| 17 | +def main(): |
| 18 | + """ This script adds intronic features to a GTF file for single-nuclei processing. |
| 19 | + and subsequently use it with featurecounts to show intronic counts |
| 20 | + """ |
| 21 | + parser = argparse.ArgumentParser() |
| 22 | + parser.add_argument( |
| 23 | + "--input-gtf", |
| 24 | + "-i", |
| 25 | + dest="input_gtf", |
| 26 | + default=None, |
| 27 | + required=True, |
| 28 | + help="input GTF", |
| 29 | + ) |
| 30 | + parser.add_argument( |
| 31 | + "--output-gtf", "-o", dest="output_gtf", default=None, help="output GTF" |
| 32 | + ) |
| 33 | + args = parser.parse_args() |
| 34 | + |
| 35 | + intron_cands = {} |
| 36 | + exons = {} |
| 37 | + exon_ids = {} |
| 38 | + gene_locs = {} |
| 39 | + gene_locations = {} |
| 40 | + # gather the location of each genes and exons; and exon_ids to avoid duplication |
| 41 | + with gzip.open(args.input_gtf, "rt") if args.input_gtf.endswith(".gz") else open( |
| 42 | + args.input_gtf, "r" |
| 43 | + ) as input_file: |
| 44 | + for line in input_file: |
| 45 | + if not line.startswith("#"): |
| 46 | + fields = [x.strip() for x in line.strip().split("\t")] |
| 47 | + if fields[2] == "exon": |
| 48 | + gene_id = get_feature(line.strip(), "gene_id") |
| 49 | + exon_id = get_feature(line.strip(), "exon_id") |
| 50 | + contig_id = fields[0] |
| 51 | + locpair = (int(fields[3]), int(fields[4])) |
| 52 | + if contig_id not in exons: |
| 53 | + exons[contig_id] = [] |
| 54 | + if exon_id not in exon_ids: |
| 55 | + exons[contig_id].append(locpair) |
| 56 | + exon_ids[exon_id] = True |
| 57 | + elif fields[2] == "gene": |
| 58 | + gene_id = get_feature(line.strip(), "gene_id") |
| 59 | + contig_id = fields[0] |
| 60 | + locpair = (int(fields[3]), int(fields[4]), gene_id) |
| 61 | + if gene_id is not None: |
| 62 | + if contig_id not in gene_locs: |
| 63 | + gene_locs[contig_id] = [] |
| 64 | + gene_locs[contig_id].append(locpair) |
| 65 | + |
| 66 | + gene_locations[gene_id] = locpair |
| 67 | + |
| 68 | + # sorted the gene locs by start |
| 69 | + for contig_id in gene_locs: |
| 70 | + gene_locs[contig_id].sort(key=lambda x: x[0], reverse=False) |
| 71 | + # print(contig_id, len(gene_locs[contig_id]), gene_locs[contig_id][:3], gene_locs[contig_id][-3:]) |
| 72 | + |
| 73 | + # keep sort the exons by start by contig |
| 74 | + for contig_id in exons: |
| 75 | + exons[contig_id].sort(key=lambda x: x[0], reverse=False) |
| 76 | + |
| 77 | + # compute the intron candidates for each contig |
| 78 | + # where any bp that is not an exon is an candidate intron whithout |
| 79 | + # worrying about the inclusiveness of that base pair within the range |
| 80 | + # of a gene |
| 81 | + for contig_id in exons: |
| 82 | + intron_cands[contig_id] = [] |
| 83 | + last_exon_end = 0 |
| 84 | + for exon_coor in exons[contig_id]: |
| 85 | + if exon_coor[0] > last_exon_end: |
| 86 | + pair = (last_exon_end, exon_coor[0]) |
| 87 | + intron_cands[contig_id].append(pair) |
| 88 | + |
| 89 | + last_exon_end = max(last_exon_end, exon_coor[1]) |
| 90 | + |
| 91 | + # add the remaining last |
| 92 | + pair = (last_exon_end, 30000000000) |
| 93 | + intron_cands[contig_id].append(pair) |
| 94 | + |
| 95 | + # global ordered (ascending) array of intronic start or end points |
| 96 | + introns = {} |
| 97 | + for contig_id in gene_locs: |
| 98 | + |
| 99 | + introns[contig_id] = [] |
| 100 | + intronic_points = [] |
| 101 | + for coor in intron_cands[contig_id]: |
| 102 | + intronic_points.append(coor[0]) |
| 103 | + intronic_points.append(coor[1]) |
| 104 | + |
| 105 | + for gene_loc in gene_locs[contig_id]: |
| 106 | + i = bisect_right(intronic_points, gene_loc[0], 0, len(intronic_points)) |
| 107 | + j = bisect_left(intronic_points, gene_loc[1], 0, len(intronic_points)) |
| 108 | + |
| 109 | + if i % 2 == 1: # it is a start location on i |
| 110 | + intron_start = gene_loc[0] |
| 111 | + intron_end = intronic_points[i] |
| 112 | + # introns[contig_id].append( (intron_start, intron_end, gene_loc[2]) ) |
| 113 | + |
| 114 | + for k in range(i, j, 2): |
| 115 | + introns[contig_id].append( |
| 116 | + (intronic_points[k], intronic_points[k + 1], gene_loc[2]) |
| 117 | + ) |
| 118 | + |
| 119 | + if j % 2 == 1: |
| 120 | + intron_start = intronic_points[j] |
| 121 | + intron_end = gene_loc[1] |
| 122 | + introns[contig_id].append((intron_start, intron_end, gene_loc[2])) |
| 123 | + |
| 124 | + genewise_introns = {} |
| 125 | + for contig_id in introns: |
| 126 | + genewise_introns[contig_id] = {} |
| 127 | + for intron in introns[contig_id]: |
| 128 | + if intron[2] not in genewise_introns[contig_id]: |
| 129 | + genewise_introns[contig_id][intron[2]] = [] |
| 130 | + genewise_introns[contig_id][intron[2]].append((intron[0], intron[1])) |
| 131 | + |
| 132 | + # print(contig_id, len(introns[contig_id]), introns[contig_id][:5]) |
| 133 | + intron_no = 1 |
| 134 | + with gzip.open(args.input_gtf, "rt") if args.input_gtf.endswith(".gz") else open( |
| 135 | + args.input_gtf, "r" |
| 136 | + ) as input_file: |
| 137 | + with gzip.open(args.output_gtf, "wb") if args.output_gtf.endswith( |
| 138 | + ".gz" |
| 139 | + ) else open(args.output_gtf, "w") as output_gtf: |
| 140 | + |
| 141 | + for line in input_file: |
| 142 | + if line.startswith("#"): |
| 143 | + if args.output_gtf.endswith(".gz"): |
| 144 | + output_gtf.write("{}".format(line.strip() + "\n").encode()) |
| 145 | + else: |
| 146 | + output_gtf.write(line.strip() + "\n") |
| 147 | + else: |
| 148 | + fields = [x.strip() for x in line.strip().split("\t")] |
| 149 | + if fields[2] == "exon": |
| 150 | + if args.output_gtf.endswith(".gz"): |
| 151 | + output_gtf.write("{}".format(line.strip() + "\n").encode()) |
| 152 | + else: |
| 153 | + output_gtf.write(line.strip() + "\n") |
| 154 | + |
| 155 | + elif fields[2] == "gene": |
| 156 | + if args.output_gtf.endswith(".gz"): |
| 157 | + output_gtf.write("{}".format(line.strip() + "\n").encode()) |
| 158 | + else: |
| 159 | + output_gtf.write(line.strip() + "\n") |
| 160 | + |
| 161 | + gene_id = get_feature(line.strip(), "gene_id") |
| 162 | + contig_id = fields[0] |
| 163 | + if gene_id in genewise_introns[contig_id]: |
| 164 | + for intron in genewise_introns[contig_id][gene_id]: |
| 165 | + mod_fields = fields.copy() |
| 166 | + mod_fields[2] = "intron" |
| 167 | + mod_fields[3] = str(intron[0]) |
| 168 | + mod_fields[4] = str(intron[1]) |
| 169 | + mod_fields[8] = mod_fields[ |
| 170 | + 8 |
| 171 | + ] + ' intron_id "{}"'.format(str(intron_no)) |
| 172 | + intron_no += 1 |
| 173 | + if args.output_gtf.endswith(".gz"): |
| 174 | + output_gtf.write( |
| 175 | + "{}".format( |
| 176 | + "\t".join(mod_fields) + "\n" |
| 177 | + ).encode() |
| 178 | + ) |
| 179 | + else: |
| 180 | + output_gtf.write("\t".join(mod_fields) + "\n") |
| 181 | + else: |
| 182 | + if args.output_gtf.endswith(".gz"): |
| 183 | + output_gtf.write("{}".format(line.strip() + "\n").encode()) |
| 184 | + else: |
| 185 | + output_gtf.write(line.strip() + "\n") |
| 186 | + |
| 187 | + |
| 188 | +if __name__ == "__main__": |
| 189 | + main() |
0 commit comments