Add GTF preprocessing script; update GTF to GENCODE v47 (#783)

epiercehoffman · web-flow · commit 2871327daa17 · 2025-04-01T16:21:48.000-04:00
diff --git a/inputs/values/resources_hg38.json b/inputs/values/resources_hg38.json
@@ -43,7 +43,7 @@
   "primary_contigs_fai" : "gs://gcp-public-data--broad-references/hg38/v0/sv-resources/resources/v1/contig.fai",
   "primary_contigs_list" : "gs://gcp-public-data--broad-references/hg38/v0/sv-resources/resources/v1/primary_contigs.list",
   "contigs_header": "gs://gatk-sv-resources-public/hg38/v0/sv-resources/resources/v1/hg38_contigs_header.vcf",
-  "protein_coding_gtf" : "gs://gatk-sv-resources-public/hg38/v0/sv-resources/resources/v1/MANE.GRCh38.v1.2.ensembl_genomic.gtf",
+  "protein_coding_gtf" : "gs://gatk-sv-resources-public/hg38/v0/sv-resources/resources/v1/gencode.v47.basic.protein_coding.canonical.gtf",
   "reference_build" : "hg38",
   "reference_bwa_alt": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.64.alt",
   "reference_bwa_amb": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.64.amb",
diff --git a/scripts/inputs/preprocess_gtf.py b/scripts/inputs/preprocess_gtf.py
@@ -0,0 +1,75 @@
+#!/usr/bin/env python
+
+"""
+Preprocess GENCODE basic GTF to extract canonical protein-coding transcripts for functional consequence annotation.
+"""
+
+import argparse
+import gzip
+
+
+CHROM_FIELD = 0  # index of the tab-separated column compared against 'chrM'
+ELEMENT_FIELD = 2  # index of the column holding the element type, e.g. "gene"
+ATTRIBUTES_FIELD = 8  # index of the column holding the key "value"; attribute pairs
+TRANSCRIPT_TYPES = {"protein_coding", "nonsense_mediated_decay"}  # accepted transcript_type values
+CANONICAL = {"MANE_Plus_Clinical", "MANE_Select", "Ensembl_canonical"}  # tag values counted as canonical
+
+
+# Reader that accepts either plain-text or gzip-compressed input
+def _open(filename):
+    """Return a text-mode read handle; names ending in .gz go through gzip."""
+    if not filename.endswith(".gz"):
+        return open(filename, 'r')
+    return gzip.open(filename, 'rt')
+
+
+def parse_attributes(field):
+    """Return (protein, canonical) flags for one GTF attributes column.
+
+    Format: key1 "value1"; key2 "value2"; keys may repeat, so no dict is built.
+    """
+    protein = canonical = False
+    for attribute in field.strip().rstrip(";").split(";"):
+        # Partition at the first space so a value containing spaces cannot break unpacking
+        key, _, val = attribute.strip().partition(" ")
+        val = val.strip().strip('"')
+        canonical = canonical or (key == "tag" and val in CANONICAL)
+        protein = protein or (key == "transcript_type" and val in TRANSCRIPT_TYPES)
+    return protein, canonical
+
+
+def process(gtf, outfile):
+    """Write canonical protein-coding records of `gtf`, plus their gene lines, to `outfile`.
+
+    Comment lines, blank lines and chrM records are dropped. A gene line is held
+    back and emitted only ahead of the first retained record that follows it.
+    """
+    with _open(gtf) as inp, open(outfile, 'w') as out:
+        gene_line = ""
+        for line in inp:
+            # Blank lines carry no record; indexing their fields would raise IndexError
+            if line.startswith("#") or not line.strip():
+                continue
+            fields = line.rstrip('\n').split('\t')
+            if fields[CHROM_FIELD] == 'chrM':
+                continue
+            if fields[ELEMENT_FIELD] == "gene":
+                gene_line = line
+                continue
+            protein, canonical = parse_attributes(fields[ATTRIBUTES_FIELD])
+            if protein and canonical:
+                out.write(gene_line + line)
+                gene_line = ""  # gene line is written once, before its first kept record
+
+
+def main():
+    """Read the two positional command-line arguments and run the GTF filter."""
+    cli = argparse.ArgumentParser()
+    for name, text in (('gtf', "Input GTF from GENCODE"), ('outfile', "Output filename")):
+        cli.add_argument(name, help=text)
+    parsed = cli.parse_args()
+    process(parsed.gtf, parsed.outfile)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/website/docs/resources.md b/website/docs/resources.md
@@ -84,7 +84,7 @@ Text file of primary contig names.
 Plain text VCF header section of primary contig sequences.
 
 #### protein_coding_gtf
-Protein coding sequence definitions for functional annotation in [General Transfer Format](https://www.ensembl.org/info/website/upload/gff.html).
+Protein-coding sequence definitions for functional annotation in [General Transfer Format](https://www.ensembl.org/info/website/upload/gff.html). This GTF was created by subsetting the [GENCODE](https://www.gencodegenes.org/human/releases.html) GRCh38 basic gene annotation GTF with the script `scripts/inputs/preprocess_gtf.py`. Transcripts tagged as Ensembl canonical, MANE Select, or MANE Plus Clinical, and with a transcript type of either protein-coding or nonsense-mediated decay, were retained. The GENCODE version is included in the filename.
 
 #### reference_dict
 Reference FASTA dictionary file (`*.dict`). See [this article](https://gatk.broadinstitute.org/hc/en-us/articles/360035531652-FASTA-Reference-genome-format) for more information.