Skip to content

Commit f7bb841

Browse files
committed
feat(pipelines): update v4 genes to annotate low coverage transcripts
1 parent 1fd65df commit f7bb841

File tree

2 files changed

+32
-4
lines changed

2 files changed

+32
-4
lines changed

data-pipeline/src/data_pipeline/data_types/gene.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -164,6 +164,24 @@ def collect_transcript_exons(transcript_exons):
164164
return exons
165165

166166

167+
def annotate_gene_models_with_low_coverage_flag(genes_path, low_coverage_tsv_path):
    """Add a low-coverage flag to genes whose canonical transcript is in a TSV.

    Args:
        genes_path: Path of the Hail table of genes to read. The table is
            accessed through its ``canonical_transcript_id`` and ``flags``
            fields.
        low_coverage_tsv_path: Path of a delimited text file imported with
            ``hl.import_table`` and keyed by its ``transcript_id`` column.

    Returns:
        The genes table with ``flags`` re-annotated: when the gene's
        ``canonical_transcript_id`` matches a row of the imported file, the
        flag is added to the existing set (an empty string set is used if
        ``flags`` is missing); otherwise ``flags`` is left as it was.
    """
    # NOTE(review): "trancript" looks like a misspelling of "transcript".
    # The value is kept verbatim because consumers of the table may match on
    # this exact string -- confirm before renaming.
    flag = "v4_low_coverage_trancript"

    genes = hl.read_table(genes_path)
    low_coverage = hl.import_table(low_coverage_tsv_path).key_by("transcript_id")

    # A defined join result means the canonical transcript has a row in the file.
    in_low_coverage_list = hl.is_defined(low_coverage[genes.canonical_transcript_id])

    # Fall back to an empty set so the flag can be added when `flags` is missing.
    flags_with_marker = hl.or_else(genes.flags, hl.empty_set(hl.tstr)).add(flag)

    return genes.annotate(
        flags=hl.if_else(in_low_coverage_list, flags_with_marker, genes.flags),
    )
183+
184+
167185
###############################################
168186
# Main #
169187
###############################################

data-pipeline/src/data_pipeline/pipelines/genes.py

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@
3232
prepare_heterozygous_variant_cooccurrence_counts,
3333
prepare_homozygous_variant_cooccurrence_counts,
3434
)
35-
from data_pipeline.data_types.gene import reject_par_y_genes
35+
from data_pipeline.data_types.gene import reject_par_y_genes, annotate_gene_models_with_low_coverage_flag
3636

3737
from data_pipeline.datasets.gnomad_v4.gnomad_v4_constraint import (
3838
prepare_gnomad_v4_constraint,
@@ -491,12 +491,22 @@ def annotate_v4_with_constraint(genes_path, constraint_path):
491491
},
492492
)
493493

494+
pipeline.add_task(
495+
"annotate_grch38_genes_step_7",
496+
annotate_gene_models_with_low_coverage_flag,
497+
f"/{genes_subdir}/genes_grch38_annotated_7.ht",
498+
{
499+
"genes_path": pipeline.get_task("annotate_grch38_genes_step_6"),
500+
"low_coverage_tsv_path": "gs://gnomad-v4-data-pipeline/inputs/v4.1.1/gnomad.v4.1.low_coverage_transcripts.tsv",
501+
},
502+
)
503+
494504
pipeline.add_task(
495505
"remove_grch38_genes_constraint_for_release",
496506
remove_gnomad_v4_constraint,
497-
f"/{genes_subdir}/genes_grch38_annotate_5_removed_constraint",
507+
f"/{genes_subdir}/genes_grch38_annotate_7_removed_constraint",
498508
{
499-
"genes_path": pipeline.get_task("annotate_grch38_genes_step_5"),
509+
"genes_path": pipeline.get_task("annotate_grch38_genes_step_7"),
500510
},
501511
)
502512

@@ -563,7 +573,7 @@ def annotate_v4_with_constraint(genes_path, constraint_path):
563573
pipeline.set_outputs(
564574
{
565575
"genes_grch37": "annotate_grch37_genes_step_5",
566-
"genes_grch38": "annotate_grch38_genes_step_6",
576+
"genes_grch38": "annotate_grch38_genes_step_7",
567577
"base_transcripts_grch37": "extract_grch37_transcripts",
568578
"base_transcripts_grch38": "extract_grch38_transcripts",
569579
"transcripts_grch37": "annotate_grch37_transcripts",

0 commit comments

Comments
 (0)