broadinstitute
diff --git a/‎LICENSE‎
Lines changed: 1 addition & 1 deletion b/‎LICENSE‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎dockers/skylab/snss2-build-indices/Dockerfile‎
Lines changed: 33 additions & 0 deletions b/‎dockers/skylab/snss2-build-indices/Dockerfile‎
Lines changed: 33 additions & 0 deletions
diff --git a/‎dockers/skylab/snss2-build-indices/add-introns-to-gtf.py‎
Lines changed: 189 additions & 0 deletions b/‎dockers/skylab/snss2-build-indices/add-introns-to-gtf.py‎
Lines changed: 189 additions & 0 deletions
diff --git a/‎dockers/skylab/snss2-build-indices/modify_gtf_human.sh‎
Lines changed: 97 additions & 0 deletions b/‎dockers/skylab/snss2-build-indices/modify_gtf_human.sh‎
Lines changed: 97 additions & 0 deletions
@@ -1,6 +1,6 @@
 BSD 3-Clause License
 
-Copyright (c) 2020, Broad Institute
+Copyright (c) 2021, Broad Institute
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
 
@@ -0,0 +1,33 @@
+FROM python:3.6.2
+
+# NOTE(review): the software/version/description/website labels below describe
+# the subread package, but the only tool this Dockerfile visibly installs is
+# STAR 2.7.8a - confirm the metadata is intended.
+LABEL maintainer="Farzaneh Khajouei <fkhajoue@broadinstitute.org>" \
+      software="subread package" \
+      version="2.0.2" \
+      description="RNA-seq high-performance read alignment, quantification and mutation discovery" \
+      website="http://subread.sourceforge.net/"
+
+# Install wget, which is used below to download the STAR release archive
+RUN apt-get update --fix-missing && apt-get install -y \
+  wget 
+
+# Install the Python packages listed in requirements.txt
+COPY requirements.txt .
+RUN pip3 install -r requirements.txt    
+
+# Everything downloaded below is unpacked under /usr/local/
+WORKDIR /usr/local/ 
+
+# Download and unpack the STAR 2.7.8a archive, make the bundled static Linux
+# binary executable and put its directory on PATH
+RUN wget https://github.com/alexdobin/STAR/archive/2.7.8a.tar.gz && \
+  tar -xf 2.7.8a.tar.gz
+RUN chmod +x /usr/local/STAR-2.7.8a/bin/Linux_x86_64_static/STAR
+ENV PATH /usr/local/STAR-2.7.8a/bin/Linux_x86_64_static/:$PATH
+RUN mkdir /script
+WORKDIR /script
+
+# Copy the GTF helper scripts into /script, make the shell scripts executable
+# and put /script on PATH
+COPY add-introns-to-gtf.py .
+COPY modify_gtf_human.sh .
+RUN chmod +x modify_gtf_human.sh
+COPY modify_gtf_mouse.sh .
+RUN chmod +x modify_gtf_mouse.sh
+ENV PATH /script/:$PATH
@@ -0,0 +1,189 @@
+#!/usr/bin/env python3
+import argparse
+import gzip
+import re
+from bisect import bisect_left, bisect_right
+
+
+def get_feature(line, feature):
+    """Return the value of attribute ``feature`` from one GTF record line.
+
+    The ninth tab-separated column is read as ``key value;`` pairs; every
+    double quote is removed from that column before it is parsed. When a key
+    occurs more than once, the last occurrence wins.
+
+    Args:
+        line: a single tab-separated GTF record.
+        feature: attribute key to look up, e.g. "gene_id".
+
+    Returns:
+        The attribute value as a string, or None when the line has fewer than
+        nine columns or does not carry the attribute.
+    """
+    columns = line.strip().split("\t")
+    if len(columns) < 9:
+        return None
+
+    attributes = {}
+    for entry in columns[8].replace('"', "").split(";"):
+        # maxsplit=1 keeps a value that contains spaces in one piece.
+        parts = entry.split(None, 1)
+        # Blank segments (e.g. "; ;") and keys without a value are skipped
+        # instead of raising IndexError.
+        if len(parts) == 2:
+            attributes[parts[0]] = parts[1].strip()
+    return attributes.get(feature)
+
+
+def _open_gtf(path, mode):
+    """Open ``path`` in text mode, transparently handling ".gz" files.
+
+    Args:
+        path: file name; a ".gz" suffix selects gzip (de)compression.
+        mode: "r" to read or "w" to write.
+
+    Returns:
+        A text-mode file object usable as a context manager.
+    """
+    if not path.endswith(".gz"):
+        return open(path, mode)
+    if mode == "w":
+        # Explicit UTF-8 and untranslated line endings, so the compressed
+        # stream holds the same bytes on every platform.
+        return gzip.open(path, "wt", encoding="utf-8", newline="\n")
+    return gzip.open(path, "rt")
+
+
+def _collect_features(input_gtf):
+    """First pass over the GTF: gather exon and gene spans per contig.
+
+    Returns:
+        A pair ``(exons, gene_locs)``. ``exons`` maps contig -> list of
+        (start, end); ``gene_locs`` maps contig -> list of
+        (start, end, gene_id). Gene records without a gene_id are ignored.
+    """
+    exons = {}
+    gene_locs = {}
+    seen_exon_ids = set()
+    with _open_gtf(input_gtf, "r") as input_file:
+        for line in input_file:
+            if line.startswith("#"):
+                continue
+            fields = [x.strip() for x in line.strip().split("\t")]
+            if len(fields) < 9:
+                # Blank or truncated line: nothing to collect.
+                continue
+            contig_id = fields[0]
+            if fields[2] == "exon":
+                exon_id = get_feature(line, "exon_id")
+                contig_exons = exons.setdefault(contig_id, [])
+                # Exons shared by several transcripts are recorded once. Exons
+                # without an exon_id cannot be deduplicated and are all kept
+                # (overlaps are merged later anyway).
+                if exon_id is None:
+                    contig_exons.append((int(fields[3]), int(fields[4])))
+                elif exon_id not in seen_exon_ids:
+                    contig_exons.append((int(fields[3]), int(fields[4])))
+                    seen_exon_ids.add(exon_id)
+            elif fields[2] == "gene":
+                gene_id = get_feature(line, "gene_id")
+                if gene_id is not None:
+                    gene_locs.setdefault(contig_id, []).append(
+                        (int(fields[3]), int(fields[4]), gene_id)
+                    )
+    return exons, gene_locs
+
+
+def _gap_boundaries(contig_exons):
+    """Return the ascending boundary points of all non-exonic gaps of a contig.
+
+    Overlapping exons are merged. The result has even length: an even index
+    holds the start of a gap (0 or the end of an exon run) and the following
+    odd index holds its end (the start of the next exon, or a large sentinel
+    after the last exon). Exon coordinates are used verbatim, i.e. no +/-1
+    adjustment is applied to the gap boundaries.
+    """
+    points = []
+    last_exon_end = 0
+    for exon_start, exon_end in sorted(contig_exons, key=lambda x: x[0]):
+        if exon_start > last_exon_end:
+            points.append(last_exon_end)
+            points.append(exon_start)
+        last_exon_end = max(last_exon_end, exon_end)
+
+    # Trailing gap after the last exon, closed by a sentinel end coordinate.
+    points.append(last_exon_end)
+    points.append(30000000000)
+    return points
+
+
+def _introns_in_gene(points, gene_start, gene_end):
+    """Return the gaps described by ``points`` clipped to one gene span.
+
+    Args:
+        points: boundary list as produced by ``_gap_boundaries``.
+        gene_start: gene start coordinate.
+        gene_end: gene end coordinate.
+
+    Returns:
+        A list of (start, end) pairs in ascending order.
+    """
+    first = bisect_right(points, gene_start, 0, len(points))
+    stop = bisect_left(points, gene_end, 0, len(points))
+    if first % 2 == 1:
+        # The gene starts inside a gap: step back to that gap's start so the
+        # loop below always walks (gap start, gap end) pairs.
+        first -= 1
+
+    found = []
+    for k in range(first, stop, 2):
+        # Clipping only changes gaps that cross a gene boundary.
+        start = max(points[k], gene_start)
+        end = min(points[k + 1], gene_end)
+        if start < end:
+            found.append((start, end))
+    return found
+
+
+def main():
+    """Add "intron" records to a GTF file.
+
+    Reads the GTF given by --input-gtf twice. The first pass collects exon and
+    gene coordinates; every stretch of a gene span that is not covered by any
+    exon on the same contig is treated as an intron of that gene. The second
+    pass copies every input line to --output-gtf and, directly after each
+    "gene" record, writes one "intron" record per intron of that gene, reusing
+    the gene record's columns and appending a running ``intron_id`` attribute.
+    Input and output may be gzip-compressed (".gz" suffix).
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--input-gtf",
+        "-i",
+        dest="input_gtf",
+        default=None,
+        required=True,
+        help="input GTF",
+    )
+    # Required: the output path is dereferenced unconditionally below.
+    parser.add_argument(
+        "--output-gtf",
+        "-o",
+        dest="output_gtf",
+        default=None,
+        required=True,
+        help="output GTF",
+    )
+    args = parser.parse_args()
+
+    exons, gene_locs = _collect_features(args.input_gtf)
+
+    # contig -> gene_id -> list of (start, end) introns
+    genewise_introns = {}
+    for contig_id, contig_genes in gene_locs.items():
+        contig_genes.sort(key=lambda x: x[0])
+        per_gene = genewise_introns.setdefault(contig_id, {})
+        if contig_id not in exons:
+            # Genes on a contig without any exon record get no introns.
+            continue
+        points = _gap_boundaries(exons[contig_id])
+        for gene_start, gene_end, gene_id in contig_genes:
+            found = _introns_in_gene(points, gene_start, gene_end)
+            if found:
+                per_gene.setdefault(gene_id, []).extend(found)
+
+    intron_no = 1
+    with _open_gtf(args.input_gtf, "r") as input_file:
+        with _open_gtf(args.output_gtf, "w") as output_gtf:
+            for line in input_file:
+                record = line.strip()
+                # Every input line is passed through unchanged (apart from
+                # surrounding whitespace).
+                output_gtf.write(record + "\n")
+                if line.startswith("#"):
+                    continue
+
+                fields = [x.strip() for x in record.split("\t")]
+                if len(fields) < 9 or fields[2] != "gene":
+                    continue
+
+                gene_id = get_feature(record, "gene_id")
+                gene_introns = genewise_introns.get(fields[0], {}).get(gene_id, ())
+                for intron in gene_introns:
+                    mod_fields = fields.copy()
+                    mod_fields[2] = "intron"
+                    mod_fields[3] = str(intron[0])
+                    mod_fields[4] = str(intron[1])
+                    mod_fields[8] = mod_fields[8] + ' intron_id "{}"'.format(
+                        str(intron_no)
+                    )
+                    intron_no += 1
+                    output_gtf.write("\t".join(mod_fields) + "\n")
+
+# Run the conversion only when this file is executed as a script.
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,97 @@
+#!/bin/bash
+#
+# Usage: modify_gtf_human.sh <fasta_in> <gtf_in>
+#
+# Files written to the current working directory:
+#   modified_<basename of fasta_in>   FASTA with rewritten sequence headers
+#   <basename of gtf_in>.modified     GTF with ID version suffixes split out
+#   gene_allowlist                    gene_id strings that pass the filters
+#   modified_<basename of gtf_in>     header lines plus allowlisted GTF lines
+
+fasta_in=$1
+fasta_modified="modified_$(basename "$fasta_in")"
+
+# Modify sequence headers in the Ensembl FASTA to match the file
+# "GRCh38.primary_assembly.genome.fa" from GENCODE. Unplaced and unlocalized
+# sequences such as "KI270728.1" have the same names in both versions.
+#
+# Input FASTA:
+#   >1 dna:chromosome chromosome:GRCh38:1:1:248956422:1 REF
+#
+# Output FASTA:
+#   >chr1 1
+# sed commands:
+# 1. Replace metadata after space with original contig name, as in GENCODE
+# 2. Add "chr" to names of autosomes and sex chromosomes
+# 3. Rename the mitochondrial chromosome from "MT" to "chrM"
+# NOTE(review): "\S" is a GNU sed extension - confirm if this must run with a
+# non-GNU sed.
+cat "$fasta_in" \
+    | sed -E 's/^>(\S+).*/>\1 \1/' \
+    | sed -E 's/^>([0-9]+|[XY]) />chr\1 /' \
+    | sed -E 's/^>MT />chrM /' \
+    > "$fasta_modified"
+
+
+# Remove version suffix from transcript, gene, and exon IDs in order to match
+# previous Cell Ranger reference packages
+#
+# Input GTF:
+#     ... gene_id "ENSG00000223972.5"; ...
+# Output GTF:
+#     ... gene_id "ENSG00000223972"; gene_version "5"; ...
+gtf_in=$2
+gtf_modified="$(basename "$gtf_in").modified"
+# Pattern matches Ensembl gene, transcript, and exon IDs for human or mouse.
+# Group 1 is the ID without its version, group 2 the optional "MUS" infix and
+# group 3 the version number, which is why the replacements use \1 and \3.
+ID="(ENS(MUS)?[GTE][0-9]+)\.([0-9]+)"
+cat "$gtf_in" \
+    | sed -E 's/gene_id "'"$ID"'";/gene_id "\1"; gene_version "\3";/' \
+    | sed -E 's/transcript_id "'"$ID"'";/transcript_id "\1"; transcript_version "\3";/' \
+    | sed -E 's/exon_id "'"$ID"'";/exon_id "\1"; exon_version "\3";/' \
+    > "$gtf_modified"
+
+
+# Define string patterns for GTF tags
+# NOTES:
+# - Since GENCODE release 31/M22 (Ensembl 97), the "lincRNA" and "antisense"
+#   biotypes are part of a more generic "lncRNA" biotype.
+# - These filters are relevant only to GTF files from GENCODE. The GTFs from
+#   Ensembl release 98 have the following differences:
+#   - The names "gene_biotype" and "transcript_biotype" are used instead of
+#     "gene_type" and "transcript_type".
+#   - Readthrough transcripts are present but are not marked with the
+#     "readthrough_transcript" tag.
+#   - Only the X chromosome versions of genes in the pseudoautosomal regions
+#     are present, so there is no "PAR" tag.
+BIOTYPE_PATTERN=\
+"(protein_coding|lncRNA|\
+IG_C_gene|IG_D_gene|IG_J_gene|IG_LV_gene|IG_V_gene|\
+IG_V_pseudogene|IG_J_pseudogene|IG_C_pseudogene|\
+TR_C_gene|TR_D_gene|TR_J_gene|TR_V_gene|\
+TR_V_pseudogene|TR_J_pseudogene)"
+GENE_PATTERN="gene_type \"${BIOTYPE_PATTERN}\""
+TX_PATTERN="transcript_type \"${BIOTYPE_PATTERN}\""
+READTHROUGH_PATTERN="tag \"readthrough_transcript\""
+PAR_PATTERN="tag \"PAR\""
+
+
+# Construct the gene ID allowlist. We filter the list of all transcripts
+# based on these criteria:
+#   - allowable gene_type (biotype)
+#   - allowable transcript_type (biotype)
+#   - no "PAR" tag (only present for Y chromosome PAR)
+#   - no "readthrough_transcript" tag
+# We then collect the list of gene IDs that have at least one associated
+# transcript passing the filters. Each allowlist entry is the literal text
+# gene_id "<id>" (closing quote included), one unique entry per line.
+cat "$gtf_modified" \
+    | awk '$3 == "transcript"' \
+    | grep -E "$GENE_PATTERN" \
+    | grep -E "$TX_PATTERN" \
+    | grep -Ev "$READTHROUGH_PATTERN" \
+    | grep -Ev "$PAR_PATTERN" \
+    | sed -E 's/.*(gene_id "[^"]+").*/\1/' \
+    | sort \
+    | uniq \
+    > gene_allowlist
+
+
+# Filter the GTF file based on the gene allowlist
+gtf_filtered="modified_$(basename "$gtf_in")"
+
+# Copy header lines beginning with "#"
+grep -E "^#" "$gtf_modified" > "$gtf_filtered"
+# Filter to the gene allowlist: -F treats every allowlist entry as a fixed
+# string, so any line containing one of the entries is appended
+grep -Ff gene_allowlist "$gtf_modified" \
+    >> "$gtf_filtered"
+