Skip to content

Commit b602395

Browse files
committed
Merge branch 'feature/sally/561/vep-hgvs-vt-automation-with-newest-changes' into feature/bencap/627/job-traceability
2 parents 07d42d7 + 7d76cca commit b602395

15 files changed

Lines changed: 1670 additions & 674 deletions

File tree

poetry.lock

Lines changed: 397 additions & 405 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/mavedb/lib/hgvs.py

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
"""HGVS nomenclature library functions for variant mapping and nomenclature conversion."""
2+
3+
import logging
4+
from typing import Sequence
5+
6+
from sqlalchemy.orm import Session
7+
8+
from mavedb.lib.exceptions import HGVSProcessingError
9+
from mavedb.models.mapped_variant import MappedVariant
10+
from mavedb.models.score_set import ScoreSet
11+
12+
logger = logging.getLogger(__name__)
13+
14+
15+
def populate_mapped_hgvs_for_variants(
16+
db: Session,
17+
score_set: ScoreSet,
18+
mapped_variants: Sequence[MappedVariant],
19+
) -> bool:
20+
"""Populate HGVS nomenclature for mapped variants.
21+
22+
This function takes mapped variants and populates their HGVS expressions
23+
(genomic, transcript, and protein nomenclature) based on the variant coordinates
24+
and the score set's target gene information.
25+
26+
Args:
27+
db (Session): Database session for persisting changes.
28+
score_set (ScoreSet): The score set containing the variants.
29+
mapped_variants (Sequence[MappedVariant]): Variants to populate HGVS for.
30+
31+
Returns:
32+
bool: True if HGVS was successfully populated, False otherwise.
33+
34+
Raises:
35+
HGVSProcessingError: If critical errors occur during HGVS mapping.
36+
"""
37+
try:
38+
# Import here to avoid circular imports
39+
from mavedb.scripts.populate_mapped_hgvs import get_target_info
40+
from mavedb.lib.vrs_mapping import get_hgvs_from_variant
41+
42+
# Get target information from the score set
43+
target_is_coding, transcript_accession = get_target_info(score_set)
44+
45+
# Process each mapped variant
46+
for mapped_variant in mapped_variants:
47+
try:
48+
# Get HGVS nomenclature for this variant
49+
hgvs_data = get_hgvs_from_variant(
50+
mapped_variant=mapped_variant,
51+
transcript_accession=transcript_accession,
52+
target_is_coding=target_is_coding,
53+
)
54+
55+
if hgvs_data:
56+
mapped_variant.post_mapped = hgvs_data
57+
db.add(mapped_variant)
58+
else:
59+
logger.warning(f"Could not generate HGVS for mapped variant {mapped_variant.id}")
60+
return False
61+
62+
except Exception as e:
63+
logger.error(f"Error processing HGVS for variant {mapped_variant.id}: {str(e)}")
64+
return False
65+
66+
db.flush()
67+
return True
68+
69+
except Exception as e:
70+
logger.error(f"Error in populate_mapped_hgvs_for_variants: {str(e)}")
71+
raise HGVSProcessingError(f"Failed to populate HGVS nomenclature: {str(e)}")

src/mavedb/lib/vep.py

Lines changed: 153 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,153 @@
1+
"""VEP (Variant Effect Predictor) library functions for functional consequence prediction."""
2+
3+
import logging
4+
from typing import Optional, Sequence
5+
6+
from mavedb.lib.utils import request_with_backoff
7+
8+
logger = logging.getLogger(__name__)
9+
10+
ENSEMBL_API_URL = "https://rest.ensembl.org"
11+
12+
# List of all possible VEP consequences, in order from most to least severe
13+
VEP_CONSEQUENCES = [
14+
"transcript_ablation",
15+
"splice_acceptor_variant",
16+
"splice_donor_variant",
17+
"stop_gained",
18+
"frameshift_variant",
19+
"stop_lost",
20+
"start_lost",
21+
"transcript_amplification",
22+
"inframe_insertion",
23+
"inframe_deletion",
24+
"missense_variant",
25+
"disruptive_inframe_insertion",
26+
"disruptive_inframe_deletion",
27+
"protein_altering_variant",
28+
"splice_region_variant",
29+
"incomplete_terminal_codon_variant",
30+
"start_retained",
31+
"stop_retained",
32+
"synonymous_variant",
33+
"coding_sequence_variant",
34+
"mature_miRNA_variant",
35+
"5_prime_UTR_premature_start_codon_gain_variant",
36+
"5_prime_UTR_variant",
37+
"3_prime_UTR_variant",
38+
"non_coding_transcript_exon_variant",
39+
"non_coding_exon_variant",
40+
"non_coding_transcript_variant",
41+
"nc_transcript_variant",
42+
"upstream_gene_variant",
43+
"downstream_gene_variant",
44+
"TFBS_ablation",
45+
"TFBS_amplification",
46+
"TF_binding_site_variant",
47+
"regulatory_region_ablation",
48+
"enhancer_ablation",
49+
"regulatory_region_amplification",
50+
"enhancer_amplification",
51+
"regulatory_region_variant",
52+
"feature_elongation",
53+
"regulatory_region",
54+
"TFBS",
55+
"feature_truncation",
56+
"exon_variant",
57+
"disruptive_inframe_deletion",
58+
"gene_variant",
59+
"variant_affecting_coding_sequence_conservation",
60+
"variant_affecting_genome_assembly_quality",
61+
"variant_of_unknown_significance",
62+
"sequence_variant",
63+
"rare_amino_acid_variant",
64+
"splice_region_variant",
65+
"downstream_gene_variant",
66+
"upstream_gene_variant",
67+
"intron_variant",
68+
"intergenic_variant",
69+
]
70+
71+
72+
def run_variant_recoder(missing_hgvs: Sequence[str]) -> dict[str, list[str]]:
73+
"""Call the Variant Recoder API and return a mapping from input HGVS strings to genomic HGVS strings.
74+
75+
Args:
76+
missing_hgvs (Sequence[str]): List of HGVS strings to recode.
77+
78+
Returns:
79+
dict[str, list[str]]: Mapping of input HGVS to list of genomic HGVS strings (hgvsg).
80+
81+
Raises:
82+
VEPProcessingError: If the API request fails.
83+
"""
84+
headers = {"Content-Type": "application/json", "Accept": "application/json"}
85+
response = request_with_backoff(
86+
method="POST",
87+
url=f"{ENSEMBL_API_URL}/variant_recoder/human",
88+
headers=headers,
89+
json={"ids": list(missing_hgvs)},
90+
)
91+
hgvs_to_genomic: dict[str, list[str]] = {}
92+
# request_with_backoff handles http errors, so no need to check response status
93+
data = response.json()
94+
for entry in data:
95+
hgvs_string = entry.get("input")
96+
if not hgvs_string:
97+
continue
98+
genomic_hgvs_list = []
99+
for variant, variant_data in entry.items():
100+
if variant == "input":
101+
continue
102+
genomic_strings = variant_data.get("hgvsg") if isinstance(variant_data, dict) else None
103+
if genomic_strings:
104+
for genomic_hgvs in genomic_strings:
105+
if genomic_hgvs.startswith("NC_"):
106+
genomic_hgvs_list.append(genomic_hgvs)
107+
if genomic_hgvs_list:
108+
hgvs_to_genomic[hgvs_string] = genomic_hgvs_list
109+
110+
return hgvs_to_genomic
111+
112+
113+
def get_functional_consequence(hgvs_strings: Sequence[str]) -> dict[str, Optional[str]]:
114+
"""Get VEP functional consequences for a batch of HGVS strings.
115+
116+
Submits HGVS strings to the Ensembl VEP API and retrieves functional consequence
117+
predictions. For any HGVS strings not found in the initial VEP response, attempts
118+
to recode them using Variant Recoder and retries with VEP.
119+
120+
Args:
121+
hgvs_strings (Sequence[str]): List of HGVS strings to process (max 200 per call).
122+
123+
Returns:
124+
dict[str, Optional[str]]: Mapping of HGVS string to functional consequence.
125+
If no consequence found, maps to None.
126+
127+
Raises:
128+
VEPProcessingError: If VEP API processing fails critically.
129+
"""
130+
if len(hgvs_strings) > 200:
131+
raise ValueError(
132+
"VEP API can process a maximum of 200 HGVS strings per request. This function does not handle batching."
133+
)
134+
135+
headers = {"Content-Type": "application/json", "Accept": "application/json"}
136+
result: dict[str, Optional[str]] = {}
137+
138+
response = request_with_backoff(
139+
method="POST",
140+
url=f"{ENSEMBL_API_URL}/vep/human/hgvs",
141+
headers=headers,
142+
json={"hgvs_notations": list(hgvs_strings)},
143+
)
144+
145+
# request_with_backoff handles http errors, so no need to check response status
146+
data = response.json()
147+
for entry in data:
148+
hgvs = entry.get("input")
149+
most_severe_consequence = entry.get("most_severe_consequence")
150+
if hgvs:
151+
result[hgvs] = most_severe_consequence
152+
153+
return result

src/mavedb/lib/workflow/definitions.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,16 @@ def annotation_pipeline_job_definitions() -> list[JobDefinition]:
8383
},
8484
"dependencies": [("warm_clingen_cache", DependencyType.SUCCESS_REQUIRED)],
8585
},
86+
{
87+
"key": "populate_vep_for_score_set",
88+
"function": "populate_vep_for_score_set",
89+
"type": JobType.MAPPED_VARIANT_ANNOTATION,
90+
"params": {
91+
"correlation_id": None, # Required param to be filled in at runtime
92+
"score_set_id": None, # Required param to be filled in at runtime
93+
},
94+
"dependencies": [("submit_score_set_mappings_to_car", DependencyType.SUCCESS_REQUIRED)],
95+
},
8696
{
8797
"key": "populate_variant_translations_for_score_set",
8898
"function": "populate_variant_translations_for_score_set",

src/mavedb/models/enums/annotation_type.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,3 +10,4 @@ class AnnotationType(str, Enum):
1010
CLINVAR_CONTROL = "clinvar_control"
1111
VEP_FUNCTIONAL_CONSEQUENCE = "vep_functional_consequence"
1212
LDH_SUBMISSION = "ldh_submission"
13+
VEP_FUNCTIONAL_CONSEQUENCE = "vep_functional_consequence"

0 commit comments

Comments
 (0)