diff --git a/.gitignore b/.gitignore index 88a6609..59d628b 100644 --- a/.gitignore +++ b/.gitignore @@ -222,3 +222,4 @@ tests/.DS_Store tests/data/.DS_Store /tests/data/hp.json /src/P6/hp.json +/uv.lock diff --git a/README.md b/README.md index 19a6639..d366a01 100644 --- a/README.md +++ b/README.md @@ -37,12 +37,13 @@ A simple, extensible CLI for downloading the Human Phenotype Ontology, parsing g ``` 2. (Recommended) Create a virtual environment (venv or Conda): - # === Simple Venv setup === + ### === Simple Venv setup === ```bash python3 -m venv .venv source .venv/bin/activate ``` - # === or with Conda === + + ### === or with Conda === ```bash conda env create -f requirements/environment.yml -y conda activate P6 @@ -207,3 +208,4 @@ This project is licensed under the AGPL-3.0. See LICENSE for details. Varenya Jain varenyajj@gmail.com GitHub: @VarenyaJ + diff --git a/requirements/requirements.txt b/requirements/requirements.txt index 0f4ac43..abe290b 100644 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -1,8 +1,10 @@ click==8.2.1 -hpo-toolkit==0.7.0 +#hpo-toolkit==0.7.0 +hpo-toolkit==0.5.5 pandas==2.3.1 phenopackets==2.0.2.post4 protobuf==3.20.3 openpyxl==3.1.5 requests==2.32.4 stairval==0.2.1 +pyphetools==0.9.118 diff --git a/src/P6/__main__.py b/src/P6/__main__.py index 1eb71e9..87a9357 100644 --- a/src/P6/__main__.py +++ b/src/P6/__main__.py @@ -18,7 +18,6 @@ from google.protobuf.json_format import MessageToJson from stairval.notepad import create_notepad from phenopackets.schema.v2.phenopackets_pb2 import Phenopacket -import phenopackets.schema.v2 as pps2 from .loader import load_sheets_as_tables from .mapper import DefaultMapper @@ -237,6 +236,8 @@ def _locate_hpo_file(hpo_path: typing.Optional[str]) -> pathlib.Path: hpo_file = pathlib.Path(hpo_path) else: hpo_file = pathlib.Path("tests/data") / "hp.json" + # Explicit file check avoids try/except (Ruff BLE001) while providing clear error flow. + # More efficient than catching an IOError later because we fail fast and early. if not hpo_file.is_file(): click.echo(f"Error: HPO file not found at {hpo_file}", err=True) sys.exit(1) @@ -347,54 +348,21 @@ def _write_phenopackets( genomic_interpretation_entry.InterpretationStatus.CONTRIBUTORY ) - # TODO: Revise VariationDescriptor and gene_context later, omit setting gene_context for now. - # variation_descriptor = genomic_interpretation_entry.variant_interpretation.variation_descriptor - # we can also set variation_descriptor.gene_context and variation_descriptor.allelic_state here then serialize out as before - # variation_descriptor.gene_context.gene_symbol = genotype_record.gene_symbol - # variation_descriptor.allelic_state = variation_descriptor.AllelicState.Value(genotype_record.zygosity.upper()) + # TODO: Revise VariationDescriptor and gene_context + # Build a complete VariationDescriptor directly from the Genotype - # Grab the VariantInterpretation and its descriptor + # Build a complete VariationDescriptor directly from the Genotype + vd = genotype_record.to_variation_descriptor() variant_interpretation = genomic_interpretation_entry.variant_interpretation - variation_descriptor = variant_interpretation.variation_descriptor - - # 1) Gene symbol & allelic state - # 'gene_context' is a message; we need to CopyFrom if setting a message, - # but for its scalar fields we can still assign directly: - variation_descriptor.gene_context.symbol = genotype_record.gene_symbol - variation_descriptor.allelic_state.CopyFrom( - pps2.OntologyClass( - id="GENO:" - + genotype_record.zygosity_code, # or however we decide to construct this later on - label=genotype_record.zygosity, - ) + # Prefer CopyFrom when available (protobuf Message API) to avoid Ruff BLE001 (broad exception). Feature-detecting keeps us compatible with protobuf builds where CopyFrom may or may not exist on the generated message class, without catching a blanket Exception + copy_from = getattr( + variant_interpretation.variation_descriptor, "CopyFrom", None ) - - # 2) HGVS expression - hgvs_expr = variation_descriptor.expressions.add() - # Attempt to set the HGVS syntax enum if available; otherwise skip. - try: - hgvs_expr.syntax = pps2.VariationDescriptor.Expression.HGVS - except AttributeError: - pass - hgvs_expr.value = genotype_record.hgvsg - - # 3) Genomic location (exact interval) and alleles, if supported - try: - loc_ctx = variation_descriptor.location - # use the nested VariationDescriptor.Location enum - loc_ctx.interval.interval_type = ( - pps2.VariationDescriptor.Location.Interval.Type.EXACT - ) - loc_ctx.interval.start = genotype_record.start_position - loc_ctx.interval.end = genotype_record.end_position - loc_ctx.reference_sequence_id = genotype_record.chromosome - - # 4) Reference & alternate alleles - variation_descriptor.reference = genotype_record.reference - variation_descriptor.alternate = genotype_record.alternate - except AttributeError: - # some protobuffs give trouble when trying to expose location/alleles so just skip - pass + if callable(copy_from): + copy_from(vd) + else: + # Fallback when CopyFrom is absent: MergeFrom retains the previous behavior without catching a blanket Exception + variant_interpretation.variation_descriptor.MergeFrom(vd) # type: ignore[attr-defined] # 3c) Add optional entries (if any): for d in patient_data["disease_records"]: diff --git a/src/P6/genotype.py b/src/P6/genotype.py index a9a98ef..68210b3 100644 --- a/src/P6/genotype.py +++ b/src/P6/genotype.py @@ -3,13 +3,45 @@ Defines the Genotype class which encapsulates all relevant fields for a genomic variant entry, with validation of each attribute. + +High-level role in P6: +- Mapper parses Excel rows → builds Genotype objects. +- Genotype.to_variation_descriptor() returns a GA4GH VariationDescriptor. +- The CLI assembles VariationDescriptors + HPO features into Phenopackets. + +VariantDescriptor construction strategy: +1) Prefer pyphetools' VariantValidator (VV) using the transcript+c. parsed + from `hgvsc` (e.g., "ENST00000205557.12:c.2428G>A"). If VV is reachable + and returns a usable object, adapt it directly. +2) If VV is disabled (P6_SKIP_VV=1/true), missing, offline, or returns a + non-standard payload, fall back to a minimal local descriptor: + - include a normalized g.HGVS (strip "chr"), gene symbol, and zygosity. +3) Always de-duplicate expressions so we don't add the same g.HGVS twice. + + +Environment flags +---------------------------------------- +P6_SKIP_VV=1 : Force the local fallback path (useful for CI/offline). +P6_ENRICH_GENE_XREFS=1 : If set and vv_lookup is importable, ask VV for HGNC/ + Ensembl IDs and add them to gene_context where possible. """ +from __future__ import annotations + +import os import re from dataclasses import dataclass -# from typing import ClassVar, Dict, List, Optional +from typing import Optional, Tuple + +import requests +import phenopackets.schema.v2 as pps2 +from pyphetools.creation.variant_validator import VariantValidator + + +# ---------------------------------- +# Patterns and small constant tables +# ---------------------------------- -# Patterns and allowed enums _VALID_ID = re.compile(r"^[A-Za-z0-9]+$") _EMAIL_PATTERN = re.compile(r"^[\w\.\+\-]+@[\w\.\-]+\.[A-Za-z]+$") _ALLOWED_CHROM_ENCODINGS = {"hgvs", "ucsc", "refseq", "ensembl", "ncbi", "ega"} @@ -21,7 +53,8 @@ "mosaic", } _ALLOWED_INHERITANCE_MODES = {"unknown", "inherited", "de_novo_mutation"} -# Mapping from the normalized zygosity terms to Genotype Ontology codes + +# GENO allelic_state codes mapped from normalized zygosity terms _GENO_ALLELIC_STATE_CODES = { "heterozygous": "0000135", "homozygous": "0000134", @@ -30,6 +63,38 @@ "mosaic": "0000150", } +# Permissive HGVS g. SNV pattern with optional "chr" prefix (captures chrom/pos/ref/alt) +_HGVS_G_SNV = re.compile( + r""" + ^\s* + (?:chr)?(?P[0-9XYM]+) # chromosome number or X/Y/M + :g\. + (?P\d+) # 1-based position + (?P[ACGT]+)>(?P[ACGT]+) # simple SNV + \s*$ + """, + re.IGNORECASE | re.VERBOSE, +) + +# Transcript + c. part, e.g. "NM_000000.0:c.100A>G", "ENST00000205557.12:c.2428G>A" +_HGVSC_TXT_RE = re.compile( + r""" + ^\s* + (?P + (?:N[MR]|X[MR]|E(?:NST)?) # NM/NR/XM/XR/ENST + [_]?\d+(?:\.\d+)? # id with optional dot-version + ) + : + (?Pc\..+)$ + """, + re.IGNORECASE | re.VERBOSE, +) + + +# ---------------------- +# Core domain data class +# ---------------------- + @dataclass class Genotype: @@ -38,17 +103,17 @@ class Genotype: Attributes: genotype_patient_ID: Unique alphanumeric patient identifier. - contact_email: Email for follow‑up communications. + contact_email: Email for follow-up communications. phasing: True if variant is phased, False otherwise. - chromosome: Chromosome name or encoding (e.g., 'chr16' or 'hgvs'). - start_position: 1‑based start coordinate (nonnegative integer). - end_position: 1‑based end coordinate (nonnegative integer). + chromosome: Chromosome name/encoding (e.g., 'chr16' or 'hgvs'). + start_position: 1-based start coordinate (non-negative integer). + end_position: 1-based end coordinate (non-negative integer). reference: Reference allele sequence. alternate: Alternate allele sequence. - gene_symbol: Official gene symbol (e.g., “BRCA1”). - hgvsg: HGVS genomic notation (e.g., “g.100A>T”). - hgvsc: HGVS coding DNA notation (e.g., “c.200A>T”). - hgvsp: HGVS protein notation (e.g., “p.Lys67Asn”). + gene_symbol: Official HGNC gene symbol (e.g., “BRCA1”). + hgvsg: Genomic HGVS notation (e.g., “1:g.100A>T” or “chr1:g.100A>T”). + hgvsc: Coding DNA HGVS notation (e.g., “NM_000000.0:c.200A>T”). + hgvsp: Protein HGVS notation (e.g., “NP_000000.0:p.Lys67Asn”). zygosity: One of the allowed zygosity terms. inheritance: One of the allowed inheritance modes. """ @@ -68,29 +133,29 @@ class Genotype: zygosity: str inheritance: str - def __post_init__(self): - # Validate patient ID + # ----------------------------- + # Input validation on init time + # ----------------------------- + + def __post_init__(self) -> None: + """Validate basic identifier formats and required string fields.""" if not _VALID_ID.match(self.genotype_patient_ID): raise ValueError(f"Invalid patient ID: {self.genotype_patient_ID!r}") - # Validate email format if not _EMAIL_PATTERN.match(self.contact_email): raise ValueError(f"Invalid contact email: {self.contact_email!r}") - # Validate chromosome: allow either a known encoding or real 'chr*' names chrom_lower = self.chromosome.lower() if not ( chrom_lower in _ALLOWED_CHROM_ENCODINGS or chrom_lower.startswith("chr") ): raise ValueError(f"Unrecognized chromosome: {self.chromosome!r}") - # Validate positions for attr in ("start_position", "end_position"): val = getattr(self, attr) if not isinstance(val, int) or val < 0: - raise ValueError(f"{attr} must be a non‑negative integer, got {val!r}") + raise ValueError(f"{attr} must be a non-negative integer, got {val!r}") - # Validate allele/gene/HGVS strings for attr in ( "reference", "alternate", @@ -103,25 +168,209 @@ def __post_init__(self): if not isinstance(val, str) or not val.strip(): raise ValueError(f"{attr} must be a nonempty string") - # Validate zygosity if self.zygosity not in _ALLOWED_ZYGOSITIES: raise ValueError(f"Invalid zygosity: {self.zygosity!r}") - # Validate inheritance if self.inheritance not in _ALLOWED_INHERITANCE_MODES: raise ValueError(f"Invalid inheritance mode: {self.inheritance!r}") + # ---------------------- + # Convenience properties + # ---------------------- + @property def zygosity_code(self) -> str: + """Return the numeric part of the GENO: allelic_state code for this zygosity.""" + try: + return _GENO_ALLELIC_STATE_CODES[self.zygosity] + except KeyError as e: + raise ValueError( + f"No GENO code defined for zygosity {self.zygosity!r}" + ) from e + + # -------------------------------------------------------------------------- + # Core responsibility: build a VariationDescriptor (VV path or local fallback) + # -------------------------------------------------------------------------- + + def to_variation_descriptor(self) -> "pps2.VariationDescriptor": """ - Returns the numeric portion of the GENO: alleleic_state code - corresponding to this Genotype's zygosity. + Build a GA4GH VariationDescriptor for this variant. + + Resolution order: + 1) If P6_SKIP_VV is set → build locally. + 2) Else, parse transcript+c. from hgvsc and attempt VV via pyphetools. + - If VV returns a usable object: enrich & return. + - On VV/network/shape issues: build locally. + All paths de-duplicate expressions to avoid double g.HGVS entries. """ + if os.getenv("P6_SKIP_VV", "").strip().lower() in {"1", "true"}: + vd = self._build_local_descriptor() + return self._enrich_descriptor_common(vd) + + tx, c_part = self._parse_hgvsc(self.hgvsc) + if not (tx and c_part): + vd = self._build_local_descriptor() + return self._enrich_descriptor_common(vd) + + # Try building via VariantValidator; keep failures graceful. try: - return _GENO_ALLELIC_STATE_CODES[self.zygosity] - except KeyError: - raise ValueError(f"No GENO code defined for zygosity {self.zygosity!r}") + vv = VariantValidator(genome_build="GRCh38", transcript=tx) + hv = vv.encode_hgvs(c_part) # pyphetools expects ONLY the c. part + vi = hv.to_variant_interpretation_202() + vd = vi.variation_descriptor + except ( + requests.RequestException, + ValueError, + TypeError, + AttributeError, + KeyError, + ): + vd = self._build_local_descriptor() + + return self._enrich_descriptor_common(vd) + + # ----------------------- + # Internal helper methods + # ----------------------- + + @staticmethod + def _parse_hgvsc(hgvsc: str) -> Tuple[Optional[str], Optional[str]]: + """ + Extract transcript identifier and the c. part from an hgvsc string. + Examples: + "NM_000000.0:c.100A>G" -> ("NM_000000.0", "c.100A>G") + "ENST00000205557.12:c.2428G>A" -> ("ENST00000205557.12", "c.2428G>A") + """ + if not isinstance(hgvsc, str): + return None, None + m = _HGVSC_TXT_RE.match(hgvsc.strip()) + if not m: + return None, None + return m.group("tx"), m.group("c") + + @staticmethod + def _normalize_g_expression(hgvsg: str) -> Optional[str]: + """ + Normalize a genomic HGVS like 'chr16:g.100A>G' -> '16:g.100A>G' for simple SNVs. + For non-SNV or non-matching patterns, return the trimmed original string. + """ + if not isinstance(hgvsg, str) or not hgvsg.strip(): + return None + s = hgvsg.strip() + m = _HGVS_G_SNV.match(s) + if m: + chrom = m.group("chrom") + pos = m.group("pos") + ref = m.group("ref").upper() + alt = m.group("alt").upper() + return f"{chrom}:g.{pos}{ref}>{alt}" + if s.lower().startswith("chr"): + return s[3:] + return s + + # ---- Descriptor builders -------------------------------------------------- + + def _build_local_descriptor(self) -> "pps2.VariationDescriptor": + """ + Construct a minimal local VariationDescriptor using normalized g.HGVS, + gene symbol, and zygosity. Used when VV is unavailable or disabled. + """ + vd = pps2.VariationDescriptor() + + # Add g. expression if present (local path never has any expressions yet) + g_value = self._normalize_g_expression(self.hgvsg) + if g_value: + self._add_hgvs_expression(vd, g_value, syntax_name="HGVS") + + # Allelic state (GENO) + if self.zygosity: + vd.allelic_state.id = f"GENO:{self.zygosity_code}" + vd.allelic_state.label = self.zygosity + + # Gene context (optional) + if self.gene_symbol: + try: + vd.gene_context.symbol = self.gene_symbol + except (AttributeError, TypeError): + # Do not let proto accessors sink the run + pass + + return vd + + def _enrich_descriptor_common( + self, vd: "pps2.VariationDescriptor" + ) -> "pps2.VariationDescriptor": + """ + Common post-processing for both VV-built and locally-built descriptors: + - ensure allelic_state matches our zygosity, + - ensure gene_context has our gene symbol (if missing), + - ensure a normalized g.HGVS expression is present exactly once. + """ + # Allelic state + if self.zygosity: + vd.allelic_state.id = f"GENO:{self.zygosity_code}" + vd.allelic_state.label = self.zygosity + + # Gene symbol (do not overwrite a non-empty symbol VV may have provided) + try: + gene_ctx = getattr(vd, "gene_context", None) + if self.gene_symbol and ( + gene_ctx is None or not getattr(gene_ctx, "symbol", "") + ): + vd.gene_context.symbol = self.gene_symbol + except (AttributeError, TypeError): + pass + + # Add normalized g.HGVS only if not already present (dedupe patch) + g_value = self._normalize_g_expression(self.hgvsg) + if g_value: + self._add_hgvs_expression_if_missing(vd, g_value, syntax_name="HGVS") + + return vd + + # ---- Expression utilities ------------------------------------------------- + + @staticmethod + def _expression_values(vd: "pps2.VariationDescriptor") -> set[str]: + """Collect all current Expression.value strings for fast membership checks.""" + try: + return {e.value for e in vd.expressions} + except (AttributeError, TypeError): + return set() + + @classmethod + def _add_hgvs_expression_if_missing( + cls, vd: "pps2.VariationDescriptor", value: str, *, syntax_name: str = "HGVS" + ) -> None: + """ + Add a new HGVS Expression only if an identical value is not already present. + Keeps VV-returned expressions from being duplicated by local enrichment. + """ + if not value: + return + if value in cls._expression_values(vd): + return + cls._add_hgvs_expression(vd, value, syntax_name=syntax_name) -# Map our human‐readable zygosity → the GA4GH GENO codes -# allelic_state_GENO_zygosity_codes: dict[str, str] = {"heterozygous": "0000135", "homozygous": "0000136", "mosaic": "0000539", "hemizygous": "0000144", "compound_heterozygosity": "0000140"} + @staticmethod + def _add_hgvs_expression( + vd: "pps2.VariationDescriptor", value: str, syntax_name: str = "HGVS" + ) -> None: + """ + Append an Expression to a VariationDescriptor. + + Parameters + ---------- + vd : VariationDescriptor + The descriptor to mutate. + value : str + HGVS string to append. + syntax_name : str, optional + Desired syntax enum name ('HGVS'), by default "HGVS". + """ + expr = vd.expressions.add() + expr.value = value + enum = getattr(type(expr), syntax_name, None) + if enum is not None and hasattr(expr, "syntax"): + expr.syntax = enum # type: ignore[attr-defined] diff --git a/src/P6/mapper.py b/src/P6/mapper.py index c17e9b9..0b4433b 100644 --- a/src/P6/mapper.py +++ b/src/P6/mapper.py @@ -835,12 +835,13 @@ def _add_phenotypic_features(pkt, phenotypes: list) -> None: if not phenotype.status: feature.excluded = True + @staticmethod @staticmethod def _add_genotype_interpretations(pkt, genotypes: list, patient_id: str) -> None: """ Add Interpretation → Diagnosis → GenomicInterpretation blocks. - Minimal VariationDescriptor with HGVS expression; set optional - location/alleles when supported by the installed protobufs. + Use the Genotype dataclass helper to build VariationDescriptor, + so gene symbol / zygosity / inheritance are preserved. """ for interpretation_index, genotype_record in enumerate(genotypes): @@ -856,38 +857,23 @@ def _add_genotype_interpretations(pkt, genotypes: list, patient_id: str) -> None genomic_interpretation_entry.InterpretationStatus.CONTRIBUTORY ) - # VariationDescriptor with HGVS expression + # VariationDescriptor: delegate to Genotype dataclass variant_interpretation = genomic_interpretation_entry.variant_interpretation variation_descriptor = variant_interpretation.variation_descriptor - expression = variation_descriptor.expressions.add() - # Attempt to set the HGVS syntax enum if available - try: - expression.syntax = pps2.VariationDescriptor.Expression.HGVS - except AttributeError: - pass - # expression.value = genotype_record.hgvsg or "" - # Canonicalize: serialize without optional 'chr' prefix so it matches - # expected '16:g.100A>G' style while still accepting either form as input. - hgvs = (genotype_record.hgvsg or "").strip() - if hgvs.lower().startswith("chr"): - hgvs = hgvs[3:] - expression.value = hgvs - - # Optional: attempt to set a subset of location/alleles if supported try: - location_context = variation_descriptor.location - location_context.interval.interval_type = ( - pps2.VariationDescriptor.Location.Interval.Type.EXACT - ) - location_context.interval.start = genotype_record.start_position - location_context.interval.end = genotype_record.end_position - location_context.reference_sequence_id = genotype_record.chromosome - variation_descriptor.reference = genotype_record.reference - variation_descriptor.alternate = genotype_record.alternate + variation_descriptor.CopyFrom(genotype_record.to_variation_descriptor()) except AttributeError: - # Some library builds do not expose these submessages; skip gracefully. - pass + # Fallback to old behavior if helper not available + expression = variation_descriptor.expressions.add() + try: + expression.syntax = pps2.VariationDescriptor.Expression.HGVS + except AttributeError: + pass + hgvs = (genotype_record.hgvsg or "").strip() + if hgvs.lower().startswith("chr"): + hgvs = hgvs[3:] + expression.value = hgvs @staticmethod def _add_diseases_to_packet(pkt, diseases: list) -> None: diff --git a/src/P6/vv_lookup.py b/src/P6/vv_lookup.py new file mode 100644 index 0000000..df9b3f8 --- /dev/null +++ b/src/P6/vv_lookup.py @@ -0,0 +1,223 @@ +""" +VariantValidator cross-reference helpers. + +High level (P6 perspective) +--------------------------- +This module is an optional enrichment layer used by P6 to attach gene +cross-references (HGNC ID, Ensembl gene ID, and canonical transcript +accessions) to the VariationDescriptor's gene_context, *after* the core +variant normalization path has succeeded. + +Key behaviors +------------- +- Calls VariantValidator's `gene2transcripts` endpoints (v2 preferred, v1 fallback). +- Normalizes the sometimes-variable VV payloads into a small, stable dict: + { + "hgnc_id": "HGNC:####", + "ensembl_gene_id": "ENSG###########", + "refseq_transcripts": [...], + "ensembl_transcripts": [...] + } +- Uses small retry/backoff for resilience. +- Is deliberately decoupled from the main parsing flow: any failure raises + `VVLookupError`; the caller should catch and ignore if enrichment isn't critical. + +Environment +----------- +VV_BASE_URL : Optional base URL override (default "https://rest.variantvalidator.org") +""" + +from __future__ import annotations + +from functools import lru_cache +from typing import Any, Dict, List +import json +import os +import time +from urllib.parse import quote as _urlencode + +import requests + + +class VVLookupError(RuntimeError): + """Raised when VariantValidator enrichment lookups fail.""" + + +# ------------------------------------------------------------------------------ +# Module configuration +# ------------------------------------------------------------------------------ + +_VV_BASE = os.getenv("VV_BASE_URL", "https://rest.variantvalidator.org").rstrip("/") + + +# ------------------------------------------------------------------------------ +# Small utilities +# ------------------------------------------------------------------------------ + + +def _sleep_backoff(i: int) -> None: + """ + Sleep using a small exponential backoff (polite to the VV API). + Sequence ~ 0.25s, 0.5s, 1s, 2s. + """ + time.sleep(0.25 * (2**i)) + + +def _request_json(url: str, *, timeout: float = 10.0) -> dict: + """ + GET JSON with simple retry/backoff for VV endpoints. + + Retries a few times on network/HTTP/JSON decode problems and raises + VVLookupError if all attempts fail. + """ + last_exc: Exception | None = None + for i in range(4): # attempts: 0,1,2,3 + try: + resp = requests.get(url, timeout=timeout) + resp.raise_for_status() + return resp.json() + except (requests.RequestException, json.JSONDecodeError, ValueError) as e: + last_exc = e + _sleep_backoff(i) + assert last_exc is not None + raise VVLookupError(f"Failed GET {url}: {last_exc}") from last_exc + + +# ------------------------------------------------------------------------------ +# Payload normalizers (keep public API stable even if VV changes shape) +# ------------------------------------------------------------------------------ + + +def _parse_v2_payload(payload: Dict[str, Any]) -> Dict[str, Any]: + """ + Normalize the gene2transcripts_v2 response into a compact dict. + + Expected output keys: + - hgnc_id (str) + - ensembl_gene_id (str) + - refseq_transcripts (List[str]) + - ensembl_transcripts (List[str]) + """ + out: Dict[str, Any] = { + "hgnc_id": "", + "ensembl_gene_id": "", + "refseq_transcripts": [], + "ensembl_transcripts": [], + } + if not isinstance(payload, dict): + return out + + hgnc = payload.get("hgnc", {}) + if isinstance(hgnc, dict): + out["hgnc_id"] = hgnc.get("hgnc_id", "") or hgnc.get("HGNC_ID", "") + out["ensembl_gene_id"] = hgnc.get("ensembl_gene_id", "") or hgnc.get( + "ensembl", "" + ) + + def _collect(lst: Any) -> List[str]: + accs: List[str] = [] + if isinstance(lst, list): + for item in lst: + if isinstance(item, dict) and item.get("accession"): + accs.append(str(item["accession"])) + return accs + + out["refseq_transcripts"] = _collect(payload.get("refseq")) + out["ensembl_transcripts"] = _collect(payload.get("ensembl")) + return out + + +def _parse_v1_payload(payload: Dict[str, Any]) -> Dict[str, Any]: + """ + Normalize the gene2transcripts (v1) response into a compact dict. + + v1 is simpler; transcripts are often plain string lists. + """ + out: Dict[str, Any] = { + "hgnc_id": "", + "ensembl_gene_id": "", + "refseq_transcripts": [], + "ensembl_transcripts": [], + } + if not isinstance(payload, dict): + return out + + out["hgnc_id"] = payload.get("hgnc_id", "") or payload.get("HGNC_ID", "") + out["ensembl_gene_id"] = payload.get("ENSEMBL", "") or payload.get("ensembl", "") + + rs = payload.get("refseq") or payload.get("RefSeq") or [] + if isinstance(rs, list): + out["refseq_transcripts"] = [str(r) for r in rs if isinstance(r, str)] + + es = payload.get("ensembl_transcripts") or payload.get("ensembl") or [] + if isinstance(es, list): + out["ensembl_transcripts"] = [str(e) for e in es if isinstance(e, str)] + + return out + + +# ------------------------------------------------------------------------------ +# Public API +# ------------------------------------------------------------------------------ + + +@lru_cache(maxsize=2048) +def get_gene_xrefs_vv( + gene_query: str, + *, + genome_build: str = "GRCh38", + transcript_set: str = "refseq", # {refseq, ensembl, all} + limit_transcripts: str = "mane", # {mane, mane_select, select, raw, all} +) -> Dict[str, Any]: + """ + Fetch HGNC, Ensembl, and transcript xrefs for an HGNC symbol/ID or transcript. + + Parameters + ---------- + gene_query : str + HGNC symbol/ID (e.g., 'ABCC6' or 'HGNC:36') or a transcript ID (NM_/ENST_). + genome_build : str, optional + 'GRCh37' or 'GRCh38' (default 'GRCh38'). + transcript_set : str, optional + 'refseq', 'ensembl', or 'all' (default 'refseq'). + limit_transcripts : str, optional + VV's limiting switch (default 'mane'). Useful values: 'mane', 'mane_select', 'select'. + + Returns + ------- + dict + Compact dict with keys: 'hgnc_id', 'ensembl_gene_id', + 'refseq_transcripts', 'ensembl_transcripts'. + + Raises + ------ + VVLookupError + If VV is unreachable or returns an unparseable/empty payload. + """ + if not gene_query or not isinstance(gene_query, str): + raise VVLookupError("gene_query must be a non-empty string") + gene_query = gene_query.strip() + + # Preferred v2 endpoint + v2_url = ( + f"{_VV_BASE}/VariantValidator/tools/gene2transcripts_v2/" + f"{_urlencode(gene_query)}/{_urlencode(limit_transcripts)}/" + f"{_urlencode(transcript_set)}/{_urlencode(genome_build)}" + "?content-type=application%2Fjson" + ) + v2_data = _request_json(v2_url) + v2_norm = _parse_v2_payload(v2_data) + if any(v2_norm.values()): + return v2_norm + + # Fallback to the simpler v1 endpoint + v1_url = ( + f"{_VV_BASE}/VariantValidator/tools/gene2transcripts/" + f"{_urlencode(gene_query)}?content-type=application%2Fjson" + ) + v1_data = _request_json(v1_url) + v1_norm = _parse_v1_payload(v1_data) + if any(v1_norm.values()): + return v1_norm + + raise VVLookupError(f"No xrefs found for {gene_query!r}")