|
| 1 | +"""VEP (Variant Effect Predictor) library functions for functional consequence prediction.""" |
| 2 | + |
| 3 | +import logging |
| 4 | +from typing import Optional, Sequence |
| 5 | + |
| 6 | +from mavedb.lib.utils import request_with_backoff |
| 7 | + |
logger = logging.getLogger(__name__)

# Base URL of the Ensembl REST API; endpoint paths are appended to it by the
# request helpers in this module.
ENSEMBL_API_URL = "https://rest.ensembl.org"

# List of all possible VEP consequences, in order from most to least severe.
#
# Every term appears exactly once so that a term's position is an unambiguous
# severity rank. Earlier revisions listed "disruptive_inframe_deletion",
# "splice_region_variant", "downstream_gene_variant" and "upstream_gene_variant"
# twice; the later copies were dropped and the first occurrence of each was kept,
# so `VEP_CONSEQUENCES.index(term)` returns the same value as before for them.
VEP_CONSEQUENCES = [
    "transcript_ablation",
    "splice_acceptor_variant",
    "splice_donor_variant",
    "stop_gained",
    "frameshift_variant",
    "stop_lost",
    "start_lost",
    "transcript_amplification",
    "inframe_insertion",
    "inframe_deletion",
    "missense_variant",
    "disruptive_inframe_insertion",
    "disruptive_inframe_deletion",
    "protein_altering_variant",
    "splice_region_variant",
    "incomplete_terminal_codon_variant",
    "start_retained",
    "stop_retained",
    "synonymous_variant",
    "coding_sequence_variant",
    "mature_miRNA_variant",
    "5_prime_UTR_premature_start_codon_gain_variant",
    "5_prime_UTR_variant",
    "3_prime_UTR_variant",
    "non_coding_transcript_exon_variant",
    "non_coding_exon_variant",
    "non_coding_transcript_variant",
    "nc_transcript_variant",
    "upstream_gene_variant",
    "downstream_gene_variant",
    "TFBS_ablation",
    "TFBS_amplification",
    "TF_binding_site_variant",
    "regulatory_region_ablation",
    "enhancer_ablation",
    "regulatory_region_amplification",
    "enhancer_amplification",
    "regulatory_region_variant",
    "feature_elongation",
    "regulatory_region",
    "TFBS",
    "feature_truncation",
    "exon_variant",
    "gene_variant",
    "variant_affecting_coding_sequence_conservation",
    "variant_affecting_genome_assembly_quality",
    "variant_of_unknown_significance",
    "sequence_variant",
    "rare_amino_acid_variant",
    "intron_variant",
    "intergenic_variant",
]
| 70 | + |
| 71 | + |
def run_variant_recoder(missing_hgvs: Sequence[str]) -> dict[str, list[str]]:
    """Call the Variant Recoder API and map input HGVS strings to genomic HGVS strings.

    Posts the identifiers to ``/variant_recoder/human`` and, for every entry in the
    JSON response, collects each ``hgvsg`` value that starts with ``NC_`` from the
    entry's dict-valued members.

    Args:
        missing_hgvs (Sequence[str]): HGVS strings to recode. An empty sequence
            returns an empty mapping without contacting the API.

    Returns:
        dict[str, list[str]]: Mapping of input HGVS to its ``NC_``-prefixed genomic
        HGVS strings (hgvsg). Inputs with no such string are omitted. If two response
        entries share the same input, the later entry replaces the earlier one.

    Raises:
        Exception: Errors from ``request_with_backoff`` are not caught here and
            propagate to the caller.
    """
    # Nothing to recode: skip the network round-trip entirely.
    if len(missing_hgvs) == 0:
        return {}

    response = request_with_backoff(
        method="POST",
        url=f"{ENSEMBL_API_URL}/variant_recoder/human",
        headers={"Content-Type": "application/json", "Accept": "application/json"},
        json={"ids": list(missing_hgvs)},
    )

    hgvs_to_genomic: dict[str, list[str]] = {}
    # request_with_backoff handles http errors, so no need to check response status
    for entry in response.json():
        # Prefer a top-level "input" when the entry carries one.
        hgvs_string = entry.get("input")
        genomic_hgvs_list: list[str] = []

        for key, allele_data in entry.items():
            # Non-dict members (the top-level input string, warning lists, ...)
            # carry no hgvsg payload.
            if key == "input" or not isinstance(allele_data, dict):
                continue
            # NOTE(review): Ensembl's documented Variant Recoder output appears to nest
            # "input" inside each per-allele object rather than at the top level of
            # the entry - confirm against a live response. Falling back to the nested
            # value keeps such entries from being dropped.
            if not hgvs_string:
                hgvs_string = allele_data.get("input")
            # Keep only genomic HGVS on RefSeq chromosome accessions ("NC_" prefix).
            genomic_hgvs_list.extend(
                genomic_hgvs
                for genomic_hgvs in allele_data.get("hgvsg") or ()
                if genomic_hgvs.startswith("NC_")
            )

        if hgvs_string and genomic_hgvs_list:
            hgvs_to_genomic[hgvs_string] = genomic_hgvs_list

    return hgvs_to_genomic
| 111 | + |
| 112 | + |
def get_functional_consequence(hgvs_strings: Sequence[str]) -> dict[str, Optional[str]]:
    """Get VEP functional consequences for a batch of HGVS strings.

    Submits the HGVS strings to the Ensembl VEP API (``/vep/human/hgvs``) in a single
    request and reads ``most_severe_consequence`` from each response entry.

    NOTE(review): this function performs no Variant Recoder fallback. HGVS strings
    that VEP does not return are simply absent from the result; callers that need
    recoding must call ``run_variant_recoder`` themselves and resubmit.

    Args:
        hgvs_strings (Sequence[str]): HGVS strings to process (max 200 per call).
            An empty sequence returns an empty mapping without contacting the API.

    Returns:
        dict[str, Optional[str]]: Mapping of HGVS string (the ``input`` field echoed
        by VEP) to its most severe consequence. The value is None when an entry has
        no ``most_severe_consequence``; inputs missing from the response have no key.

    Raises:
        ValueError: If more than 200 HGVS strings are supplied.
        Exception: Errors from ``request_with_backoff`` are not caught here and
            propagate to the caller.
    """
    if len(hgvs_strings) > 200:
        raise ValueError(
            "VEP API can process a maximum of 200 HGVS strings per request. This function does not handle batching."
        )

    # Nothing to annotate: skip the network round-trip entirely.
    if len(hgvs_strings) == 0:
        return {}

    response = request_with_backoff(
        method="POST",
        url=f"{ENSEMBL_API_URL}/vep/human/hgvs",
        headers={"Content-Type": "application/json", "Accept": "application/json"},
        json={"hgvs_notations": list(hgvs_strings)},
    )

    result: dict[str, Optional[str]] = {}
    # request_with_backoff handles http errors, so no need to check response status
    for entry in response.json():
        hgvs = entry.get("input")
        # Entries without an echoed input cannot be attributed to a submitted string.
        if hgvs:
            result[hgvs] = entry.get("most_severe_consequence")

    return result
0 commit comments