Skip to content

Commit 2871327

Browse files
Add GTF preprocessing script; update GTF to GENCODE v47 (#783)
1 parent 63ce732 commit 2871327

File tree

3 files changed

+77
-2
lines changed

3 files changed

+77
-2
lines changed

inputs/values/resources_hg38.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@
4343
"primary_contigs_fai" : "gs://gcp-public-data--broad-references/hg38/v0/sv-resources/resources/v1/contig.fai",
4444
"primary_contigs_list" : "gs://gcp-public-data--broad-references/hg38/v0/sv-resources/resources/v1/primary_contigs.list",
4545
"contigs_header": "gs://gatk-sv-resources-public/hg38/v0/sv-resources/resources/v1/hg38_contigs_header.vcf",
46-
"protein_coding_gtf" : "gs://gatk-sv-resources-public/hg38/v0/sv-resources/resources/v1/MANE.GRCh38.v1.2.ensembl_genomic.gtf",
46+
"protein_coding_gtf" : "gs://gatk-sv-resources-public/hg38/v0/sv-resources/resources/v1/gencode.v47.basic.protein_coding.canonical.gtf",
4747
"reference_build" : "hg38",
4848
"reference_bwa_alt": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.64.alt",
4949
"reference_bwa_amb": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.64.amb",

scripts/inputs/preprocess_gtf.py

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
#!/usr/bin/env python
2+
3+
"""
4+
Preprocess GENCODE basic GTF to extract canonical protein-coding transcripts for functional consequence annotation.
5+
"""
6+
7+
import argparse
8+
import gzip
9+
10+
11+
CHROM_FIELD = 0
12+
ELEMENT_FIELD = 2
13+
ATTRIBUTES_FIELD = 8
14+
TRANSCRIPT_TYPES = {"protein_coding", "nonsense_mediated_decay"}
15+
CANONICAL = {"MANE_Plus_Clinical", "MANE_Select", "Ensembl_canonical"}
16+
17+
18+
# Flexibly open .gz or uncompressed file to read
19+
def _open(filename):
20+
if filename.endswith(".gz"):
21+
return gzip.open(filename, 'rt')
22+
else:
23+
return open(filename, 'r')
24+
25+
26+
# Extract transcript type and canonical status
27+
def parse_attributes(field):
28+
# format: key1 "value1"; key2 "value2";
29+
# keys may be repeated so cannot convert directly to dictionary
30+
attributes_list = [tuple(x.replace('"', '').split(' ')) for x in field.rstrip(";").split("; ")]
31+
protein = False
32+
canonical = False
33+
for key, val in attributes_list:
34+
if key == "tag" and val in CANONICAL:
35+
canonical = True
36+
elif key == "transcript_type" and val in TRANSCRIPT_TYPES:
37+
protein = True
38+
return protein, canonical
39+
40+
41+
def process(gtf, outfile):
42+
with _open(gtf) as inp, open(outfile, 'w') as out:
43+
gene_line = ""
44+
for line in inp:
45+
if line.startswith("#"):
46+
continue
47+
fields = line.rstrip('\n').split('\t')
48+
49+
# Drop mitochondria
50+
if fields[CHROM_FIELD] == 'chrM':
51+
continue
52+
53+
# Store gene line to print if transcript is eligible
54+
if fields[ELEMENT_FIELD] == "gene":
55+
gene_line = line
56+
continue
57+
58+
# Select protein-coding and canonical transcripts only
59+
protein, canonical = parse_attributes(fields[ATTRIBUTES_FIELD])
60+
if protein and canonical:
61+
out.write(gene_line + line)
62+
gene_line = "" # only print gene line before first transcript line
63+
64+
65+
def main():
66+
parser = argparse.ArgumentParser()
67+
parser.add_argument('gtf', help="Input GTF from GENCODE")
68+
parser.add_argument('outfile', help="Output filename")
69+
args = parser.parse_args()
70+
71+
process(args.gtf, args.outfile)
72+
73+
74+
if __name__ == '__main__':
75+
main()

website/docs/resources.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,7 @@ Text file of primary contig names.
8484
Plain text VCF header section of primary contig sequences.
8585

8686
#### protein_coding_gtf
87-
Protein coding sequence definitions for functional annotation in [General Transfer Format](https://www.ensembl.org/info/website/upload/gff.html).
87+
Protein-coding sequence definitions for functional annotation in [General Transfer Format](https://www.ensembl.org/info/website/upload/gff.html). This GTF was created by subsetting the [GENCODE](https://www.gencodegenes.org/human/releases.html) GRCh38 basic gene annotation GTF with the script `scripts/inputs/preprocess_gtf.py`. Transcripts annotated as either Ensembl canonical or MANE Select Plus Clinical, and as either protein-coding or from nonsense-mediated decay, were retained. The GENCODE version is included in the filename.
8888

8989
#### reference_dict
9090
Reference FASTA dictionary file (`*.dict`). See [this article](https://gatk.broadinstitute.org/hc/en-us/articles/360035531652-FASTA-Reference-genome-format) for more information.

0 commit comments

Comments
 (0)