Skip to content

Commit 59e19e5

Browse files
authored
Merge pull request #307 from broadinstitute/staging
Staging -> Master
2 parents 7e97dd5 + 2d988ff commit 59e19e5

94 files changed

Lines changed: 1441 additions & 938 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

LICENSE

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
BSD 3-Clause License
22

3-
Copyright (c) 2020, Broad Institute
3+
Copyright (c) 2021, Broad Institute
44
All rights reserved.
55

66
Redistribution and use in source and binary forms, with or without
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
FROM python:3.6.2
2+
3+
LABEL maintainer="Farzaneh Khajouei <fkhajoue@broadinstitute.org>" \
4+
software="subread package" \
5+
version="2.0.2" \
6+
description="RNA-seq high-performance read alignment, quantification and mutation discovery" \
7+
website="http://subread.sourceforge.net/"
8+
9+
# Install compiler
10+
RUN apt-get update --fix-missing && apt-get install -y \
11+
wget
12+
13+
COPY requirements.txt .
14+
RUN pip3 install -r requirements.txt
15+
16+
# Install subread
17+
WORKDIR /usr/local/
18+
19+
# copy the script that removes alignments spanning intron-exon junctions
20+
RUN wget https://github.com/alexdobin/STAR/archive/2.7.8a.tar.gz && \
21+
tar -xf 2.7.8a.tar.gz
22+
RUN chmod +x /usr/local/STAR-2.7.8a/bin/Linux_x86_64_static/STAR
23+
ENV PATH /usr/local/STAR-2.7.8a/bin/Linux_x86_64_static/:$PATH
24+
RUN mkdir /script
25+
WORKDIR /script
26+
27+
28+
COPY add-introns-to-gtf.py .
29+
COPY modify_gtf_human.sh .
30+
RUN chmod +x modify_gtf_human.sh
31+
COPY modify_gtf_mouse.sh .
32+
RUN chmod +x modify_gtf_mouse.sh
33+
ENV PATH /script/:$PATH
Lines changed: 189 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,189 @@
1+
#!/usr/bin/env python3
2+
import argparse
3+
import gzip
4+
import re
5+
from bisect import bisect_left, bisect_right
6+
7+
8+
def get_feature(line, feature):
9+
features = re.sub('"', "", line.strip().split("\t")[8].strip())
10+
features_dic = {x.split()[0]: x.split()[1] for x in features.split(";") if x}
11+
12+
if feature in features_dic:
13+
return features_dic[feature]
14+
return None
15+
16+
17+
def main():
18+
""" This script adds intronic features to a GTF file for single-nuclei processing.
19+
and subsequently use it with featurecounts to show intronic counts
20+
"""
21+
parser = argparse.ArgumentParser()
22+
parser.add_argument(
23+
"--input-gtf",
24+
"-i",
25+
dest="input_gtf",
26+
default=None,
27+
required=True,
28+
help="input GTF",
29+
)
30+
parser.add_argument(
31+
"--output-gtf", "-o", dest="output_gtf", default=None, help="output GTF"
32+
)
33+
args = parser.parse_args()
34+
35+
intron_cands = {}
36+
exons = {}
37+
exon_ids = {}
38+
gene_locs = {}
39+
gene_locations = {}
40+
# gather the location of each genes and exons; and exon_ids to avoid duplication
41+
with gzip.open(args.input_gtf, "rt") if args.input_gtf.endswith(".gz") else open(
42+
args.input_gtf, "r"
43+
) as input_file:
44+
for line in input_file:
45+
if not line.startswith("#"):
46+
fields = [x.strip() for x in line.strip().split("\t")]
47+
if fields[2] == "exon":
48+
gene_id = get_feature(line.strip(), "gene_id")
49+
exon_id = get_feature(line.strip(), "exon_id")
50+
contig_id = fields[0]
51+
locpair = (int(fields[3]), int(fields[4]))
52+
if contig_id not in exons:
53+
exons[contig_id] = []
54+
if exon_id not in exon_ids:
55+
exons[contig_id].append(locpair)
56+
exon_ids[exon_id] = True
57+
elif fields[2] == "gene":
58+
gene_id = get_feature(line.strip(), "gene_id")
59+
contig_id = fields[0]
60+
locpair = (int(fields[3]), int(fields[4]), gene_id)
61+
if gene_id is not None:
62+
if contig_id not in gene_locs:
63+
gene_locs[contig_id] = []
64+
gene_locs[contig_id].append(locpair)
65+
66+
gene_locations[gene_id] = locpair
67+
68+
# sorted the gene locs by start
69+
for contig_id in gene_locs:
70+
gene_locs[contig_id].sort(key=lambda x: x[0], reverse=False)
71+
# print(contig_id, len(gene_locs[contig_id]), gene_locs[contig_id][:3], gene_locs[contig_id][-3:])
72+
73+
# keep sort the exons by start by contig
74+
for contig_id in exons:
75+
exons[contig_id].sort(key=lambda x: x[0], reverse=False)
76+
77+
# compute the intron candidates for each contig
78+
# where any bp that is not an exon is an candidate intron whithout
79+
# worrying about the inclusiveness of that base pair within the range
80+
# of a gene
81+
for contig_id in exons:
82+
intron_cands[contig_id] = []
83+
last_exon_end = 0
84+
for exon_coor in exons[contig_id]:
85+
if exon_coor[0] > last_exon_end:
86+
pair = (last_exon_end, exon_coor[0])
87+
intron_cands[contig_id].append(pair)
88+
89+
last_exon_end = max(last_exon_end, exon_coor[1])
90+
91+
# add the remaining last
92+
pair = (last_exon_end, 30000000000)
93+
intron_cands[contig_id].append(pair)
94+
95+
# global ordered (ascending) array of intronic start or end points
96+
introns = {}
97+
for contig_id in gene_locs:
98+
99+
introns[contig_id] = []
100+
intronic_points = []
101+
for coor in intron_cands[contig_id]:
102+
intronic_points.append(coor[0])
103+
intronic_points.append(coor[1])
104+
105+
for gene_loc in gene_locs[contig_id]:
106+
i = bisect_right(intronic_points, gene_loc[0], 0, len(intronic_points))
107+
j = bisect_left(intronic_points, gene_loc[1], 0, len(intronic_points))
108+
109+
if i % 2 == 1: # it is a start location on i
110+
intron_start = gene_loc[0]
111+
intron_end = intronic_points[i]
112+
# introns[contig_id].append( (intron_start, intron_end, gene_loc[2]) )
113+
114+
for k in range(i, j, 2):
115+
introns[contig_id].append(
116+
(intronic_points[k], intronic_points[k + 1], gene_loc[2])
117+
)
118+
119+
if j % 2 == 1:
120+
intron_start = intronic_points[j]
121+
intron_end = gene_loc[1]
122+
introns[contig_id].append((intron_start, intron_end, gene_loc[2]))
123+
124+
genewise_introns = {}
125+
for contig_id in introns:
126+
genewise_introns[contig_id] = {}
127+
for intron in introns[contig_id]:
128+
if intron[2] not in genewise_introns[contig_id]:
129+
genewise_introns[contig_id][intron[2]] = []
130+
genewise_introns[contig_id][intron[2]].append((intron[0], intron[1]))
131+
132+
# print(contig_id, len(introns[contig_id]), introns[contig_id][:5])
133+
intron_no = 1
134+
with gzip.open(args.input_gtf, "rt") if args.input_gtf.endswith(".gz") else open(
135+
args.input_gtf, "r"
136+
) as input_file:
137+
with gzip.open(args.output_gtf, "wb") if args.output_gtf.endswith(
138+
".gz"
139+
) else open(args.output_gtf, "w") as output_gtf:
140+
141+
for line in input_file:
142+
if line.startswith("#"):
143+
if args.output_gtf.endswith(".gz"):
144+
output_gtf.write("{}".format(line.strip() + "\n").encode())
145+
else:
146+
output_gtf.write(line.strip() + "\n")
147+
else:
148+
fields = [x.strip() for x in line.strip().split("\t")]
149+
if fields[2] == "exon":
150+
if args.output_gtf.endswith(".gz"):
151+
output_gtf.write("{}".format(line.strip() + "\n").encode())
152+
else:
153+
output_gtf.write(line.strip() + "\n")
154+
155+
elif fields[2] == "gene":
156+
if args.output_gtf.endswith(".gz"):
157+
output_gtf.write("{}".format(line.strip() + "\n").encode())
158+
else:
159+
output_gtf.write(line.strip() + "\n")
160+
161+
gene_id = get_feature(line.strip(), "gene_id")
162+
contig_id = fields[0]
163+
if gene_id in genewise_introns[contig_id]:
164+
for intron in genewise_introns[contig_id][gene_id]:
165+
mod_fields = fields.copy()
166+
mod_fields[2] = "intron"
167+
mod_fields[3] = str(intron[0])
168+
mod_fields[4] = str(intron[1])
169+
mod_fields[8] = mod_fields[
170+
8
171+
] + ' intron_id "{}"'.format(str(intron_no))
172+
intron_no += 1
173+
if args.output_gtf.endswith(".gz"):
174+
output_gtf.write(
175+
"{}".format(
176+
"\t".join(mod_fields) + "\n"
177+
).encode()
178+
)
179+
else:
180+
output_gtf.write("\t".join(mod_fields) + "\n")
181+
else:
182+
if args.output_gtf.endswith(".gz"):
183+
output_gtf.write("{}".format(line.strip() + "\n").encode())
184+
else:
185+
output_gtf.write(line.strip() + "\n")
186+
187+
188+
if __name__ == "__main__":
189+
main()
Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
#!/bin/bash
2+
3+
fasta_in=$1
4+
fasta_modified="modified_$(basename "$fasta_in")"
5+
6+
# Modify sequence headers in the Ensembl FASTA to match the file
7+
# "GRCh38.primary_assembly.genome.fa" from GENCODE. Unplaced and unlocalized
8+
# sequences such as "KI270728.1" have the same names in both versions.
9+
#
10+
# Input FASTA:
11+
# >1 dna:chromosome chromosome:GRCh38:1:1:248956422:1 REF
12+
#
13+
# Output FASTA:
14+
# >chr1 1
15+
# sed commands:
16+
# 1. Replace metadata after space with original contig name, as in GENCODE
17+
# 2. Add "chr" to names of autosomes and sex chromosomes
18+
# 3. Handle the mitochrondrial chromosome
19+
cat "$fasta_in" \
20+
| sed -E 's/^>(\S+).*/>\1 \1/' \
21+
| sed -E 's/^>([0-9]+|[XY]) />chr\1 /' \
22+
| sed -E 's/^>MT />chrM /' \
23+
> "$fasta_modified"
24+
25+
26+
# Remove version suffix from transcript, gene, and exon IDs in order to match
27+
# previous Cell Ranger reference packages
28+
#
29+
# Input GTF:
30+
# ... gene_id "ENSG00000223972.5"; ...
31+
# Output GTF:
32+
# ... gene_id "ENSG00000223972"; gene_version "5"; ...
33+
gtf_in=$2
34+
gtf_modified="$(basename "$gtf_in").modified"
35+
# Pattern matches Ensembl gene, transcript, and exon IDs for human or mouse:
36+
ID="(ENS(MUS)?[GTE][0-9]+)\.([0-9]+)"
37+
cat "$gtf_in" \
38+
| sed -E 's/gene_id "'"$ID"'";/gene_id "\1"; gene_version "\3";/' \
39+
| sed -E 's/transcript_id "'"$ID"'";/transcript_id "\1"; transcript_version "\3";/' \
40+
| sed -E 's/exon_id "'"$ID"'";/exon_id "\1"; exon_version "\3";/' \
41+
> "$gtf_modified"
42+
43+
44+
# Define string patterns for GTF tags
45+
# NOTES:
46+
# - Since GENCODE release 31/M22 (Ensembl 97), the "lincRNA" and "antisense"
47+
# biotypes are part of a more generic "lncRNA" biotype.
48+
# - These filters are relevant only to GTF files from GENCODE. The GTFs from
49+
# Ensembl release 98 have the following differences:
50+
# - The names "gene_biotype" and "transcript_biotype" are used instead of
51+
# "gene_type" and "transcript_type".
52+
# - Readthrough transcripts are present but are not marked with the
53+
# "readthrough_transcript" tag.
54+
# - Only the X chromosome versions of genes in the pseudoautosomal regions
55+
# are present, so there is no "PAR" tag.
56+
BIOTYPE_PATTERN=\
57+
"(protein_coding|lncRNA|\
58+
IG_C_gene|IG_D_gene|IG_J_gene|IG_LV_gene|IG_V_gene|\
59+
IG_V_pseudogene|IG_J_pseudogene|IG_C_pseudogene|\
60+
TR_C_gene|TR_D_gene|TR_J_gene|TR_V_gene|\
61+
TR_V_pseudogene|TR_J_pseudogene)"
62+
GENE_PATTERN="gene_type \"${BIOTYPE_PATTERN}\""
63+
TX_PATTERN="transcript_type \"${BIOTYPE_PATTERN}\""
64+
READTHROUGH_PATTERN="tag \"readthrough_transcript\""
65+
PAR_PATTERN="tag \"PAR\""
66+
67+
68+
# Construct the gene ID allowlist. We filter the list of all transcripts
69+
# based on these criteria:
70+
# - allowable gene_type (biotype)
71+
# - allowable transcript_type (biotype)
72+
# - no "PAR" tag (only present for Y chromosome PAR)
73+
# - no "readthrough_transcript" tag
74+
# We then collect the list of gene IDs that have at least one associated
75+
# transcript passing the filters.
76+
cat "$gtf_modified" \
77+
| awk '$3 == "transcript"' \
78+
| grep -E "$GENE_PATTERN" \
79+
| grep -E "$TX_PATTERN" \
80+
| grep -Ev "$READTHROUGH_PATTERN" \
81+
| grep -Ev "$PAR_PATTERN" \
82+
| sed -E 's/.*(gene_id "[^"]+").*/\1/' \
83+
| sort \
84+
| uniq \
85+
> gene_allowlist
86+
87+
88+
# Filter the GTF file based on the gene allowlist
89+
gtf_filtered="modified_$(basename "$gtf_in")"
90+
91+
# Copy header lines beginning with "#"
92+
grep -E "^#" "$gtf_modified" > "$gtf_filtered"
93+
# Filter to the gene allowlist
94+
grep -Ff gene_allowlist "$gtf_modified" \
95+
>> "$gtf_filtered"
96+
97+

0 commit comments

Comments
 (0)