diff --git a/.gitignore b/.gitignore
index 88a6609..59d628b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -222,3 +222,4 @@ tests/.DS_Store
 tests/data/.DS_Store
 /tests/data/hp.json
 /src/P6/hp.json
+/uv.lock
diff --git a/README.md b/README.md
index 19a6639..d366a01 100644
--- a/README.md
+++ b/README.md
@@ -37,12 +37,13 @@ A simple, extensible CLI for downloading the Human Phenotype Ontology, parsing g
     ```
 
 2.  (Recommended) Create a virtual environment (venv or Conda):
-    # === Simple Venv setup ===
+    ### === Simple Venv setup ===
     ```bash
     python3 -m venv .venv
     source .venv/bin/activate
     ```
-    # === or with Conda ===
+
+    ### === or with Conda ===
     ```bash
     conda env create -f requirements/environment.yml -y
     conda activate P6
@@ -207,3 +208,4 @@ This project is licensed under the AGPL-3.0. See LICENSE for details.
 Varenya Jain
 varenyajj@gmail.com
 GitHub: @VarenyaJ
+
diff --git a/requirements/requirements.txt b/requirements/requirements.txt
index 0f4ac43..abe290b 100644
--- a/requirements/requirements.txt
+++ b/requirements/requirements.txt
@@ -1,8 +1,10 @@
 click==8.2.1
-hpo-toolkit==0.7.0
+# hpo-toolkit was 0.7.0; pinned back to 0.5.5 below (presumably for pyphetools compatibility; TODO confirm)
+hpo-toolkit==0.5.5
 pandas==2.3.1
 phenopackets==2.0.2.post4
 protobuf==3.20.3
 openpyxl==3.1.5
 requests==2.32.4
 stairval==0.2.1
+pyphetools==0.9.118
diff --git a/src/P6/__main__.py b/src/P6/__main__.py
index 1eb71e9..87a9357 100644
--- a/src/P6/__main__.py
+++ b/src/P6/__main__.py
@@ -18,7 +18,6 @@
 from google.protobuf.json_format import MessageToJson
 from stairval.notepad import create_notepad
 from phenopackets.schema.v2.phenopackets_pb2 import Phenopacket
-import phenopackets.schema.v2 as pps2
 
 from .loader import load_sheets_as_tables
 from .mapper import DefaultMapper
@@ -237,6 +236,8 @@ def _locate_hpo_file(hpo_path: typing.Optional[str]) -> pathlib.Path:
         hpo_file = pathlib.Path(hpo_path)
     else:
         hpo_file = pathlib.Path("tests/data") / "hp.json"
+    # Explicit file check avoids try/except (Ruff BLE001) while providing clear error flow.
+    # More efficient than catching an IOError later because we fail fast and early.
     if not hpo_file.is_file():
         click.echo(f"Error: HPO file not found at {hpo_file}", err=True)
         sys.exit(1)
@@ -347,54 +348,21 @@ def _write_phenopackets(
                 genomic_interpretation_entry.InterpretationStatus.CONTRIBUTORY
             )
 
-            # TODO: Revise VariationDescriptor and gene_context later, omit setting gene_context for now.
-            # variation_descriptor = genomic_interpretation_entry.variant_interpretation.variation_descriptor
-            # we can also set variation_descriptor.gene_context and variation_descriptor.allelic_state here then serialize out as before
-            # variation_descriptor.gene_context.gene_symbol = genotype_record.gene_symbol
-            # variation_descriptor.allelic_state = variation_descriptor.AllelicState.Value(genotype_record.zygosity.upper())
+            # TODO: Revise VariationDescriptor and gene_context handling; both are
+            # now populated inside Genotype.to_variation_descriptor(), not here.
 
-            # Grab the VariantInterpretation and its descriptor
+            # Build a complete VariationDescriptor directly from the Genotype
+            vd = genotype_record.to_variation_descriptor()
             variant_interpretation = genomic_interpretation_entry.variant_interpretation
-            variation_descriptor = variant_interpretation.variation_descriptor
-
-            # 1) Gene symbol & allelic state
-            # 'gene_context' is a message; we need to CopyFrom if setting a message,
-            # but for its scalar fields we can still assign directly:
-            variation_descriptor.gene_context.symbol = genotype_record.gene_symbol
-            variation_descriptor.allelic_state.CopyFrom(
-                pps2.OntologyClass(
-                    id="GENO:"
-                    + genotype_record.zygosity_code,  # or however we decide to construct this later on
-                    label=genotype_record.zygosity,
-                )
+            # CopyFrom replaces the target descriptor with vd. The protobuf Message API always defines CopyFrom, so this getattr probe is purely defensive; it is used instead of a broad try/except (Ruff BLE001).
+            copy_from = getattr(
+                variant_interpretation.variation_descriptor, "CopyFrom", None
             )
-
-            # 2) HGVS expression
-            hgvs_expr = variation_descriptor.expressions.add()
-            # Attempt to set the HGVS syntax enum if available; otherwise skip.
-            try:
-                hgvs_expr.syntax = pps2.VariationDescriptor.Expression.HGVS
-            except AttributeError:
-                pass
-            hgvs_expr.value = genotype_record.hgvsg
-
-            # 3) Genomic location (exact interval) and alleles, if supported
-            try:
-                loc_ctx = variation_descriptor.location
-                # use the nested VariationDescriptor.Location enum
-                loc_ctx.interval.interval_type = (
-                    pps2.VariationDescriptor.Location.Interval.Type.EXACT
-                )
-                loc_ctx.interval.start = genotype_record.start_position
-                loc_ctx.interval.end = genotype_record.end_position
-                loc_ctx.reference_sequence_id = genotype_record.chromosome
-
-                # 4) Reference & alternate alleles
-                variation_descriptor.reference = genotype_record.reference
-                variation_descriptor.alternate = genotype_record.alternate
-            except AttributeError:
-                # some protobuffs give trouble when trying to expose location/alleles so just skip
-                pass
+            if callable(copy_from):
+                copy_from(vd)
+            else:
+                # Fallback when CopyFrom is absent: MergeFrom retains the previous behavior without catching a blanket Exception
+                variant_interpretation.variation_descriptor.MergeFrom(vd)  # type: ignore[attr-defined]
 
         # 3c) Add optional entries (if any):
         for d in patient_data["disease_records"]:
diff --git a/src/P6/genotype.py b/src/P6/genotype.py
index a9a98ef..68210b3 100644
--- a/src/P6/genotype.py
+++ b/src/P6/genotype.py
@@ -3,13 +3,45 @@
 
 Defines the Genotype class which encapsulates all relevant fields for a
 genomic variant entry, with validation of each attribute.
+
+High-level role in P6:
+- Mapper parses Excel rows → builds Genotype objects.
+- Genotype.to_variation_descriptor() returns a GA4GH VariationDescriptor.
+- The CLI assembles VariationDescriptors + HPO features into Phenopackets.
+
+VariationDescriptor construction strategy:
+1) Prefer pyphetools' VariantValidator (VV) using the transcript+c. parsed
+   from `hgvsc` (e.g., "ENST00000205557.12:c.2428G>A"). If VV is reachable
+   and returns a usable object, adapt it directly.
+2) If VV is disabled (P6_SKIP_VV=1/true), missing, offline, or returns a
+   non-standard payload, fall back to a minimal local descriptor:
+   - include a normalized g.HGVS (strip "chr"), gene symbol, and zygosity.
+3) Always de-duplicate expressions so we don't add the same g.HGVS twice.
+
+
+Environment flags
+----------------------------------------
+P6_SKIP_VV=1           : Force the local fallback path (useful for CI/offline).
+P6_ENRICH_GENE_XREFS=1 : Intended to add HGNC/Ensembl IDs from vv_lookup to
+                         gene_context. NOTE(review): nothing here reads it yet.
 """
 
+from __future__ import annotations
+
+import os
 import re
 from dataclasses import dataclass
-# from typing import ClassVar, Dict, List, Optional
+from typing import Optional, Tuple
+
+import requests
+import phenopackets.schema.v2 as pps2
+from pyphetools.creation.variant_validator import VariantValidator
+
+
+# ----------------------------------
+# Patterns and small constant tables
+# ----------------------------------
 
-# Patterns and allowed enums
 _VALID_ID = re.compile(r"^[A-Za-z0-9]+$")
 _EMAIL_PATTERN = re.compile(r"^[\w\.\+\-]+@[\w\.\-]+\.[A-Za-z]+$")
 _ALLOWED_CHROM_ENCODINGS = {"hgvs", "ucsc", "refseq", "ensembl", "ncbi", "ega"}
@@ -21,7 +53,8 @@
     "mosaic",
 }
 _ALLOWED_INHERITANCE_MODES = {"unknown", "inherited", "de_novo_mutation"}
-# Mapping from the normalized zygosity terms to Genotype Ontology codes
+
+# GENO allelic_state codes mapped from normalized zygosity terms
 _GENO_ALLELIC_STATE_CODES = {
     "heterozygous": "0000135",
     "homozygous": "0000134",
@@ -30,6 +63,38 @@
     "mosaic": "0000150",
 }
 
+# Permissive HGVS g. SNV pattern with optional "chr" prefix (captures chrom/pos/ref/alt)
+_HGVS_G_SNV = re.compile(
+    r"""
+    ^\s*
+    (?:chr)?(?P<chrom>[0-9XYM]+)       # chromosome number or X/Y/M
+    :g\.
+    (?P<pos>\d+)                       # 1-based position
+    (?P<ref>[ACGT]+)>(?P<alt>[ACGT]+)  # simple SNV
+    \s*$
+    """,
+    re.IGNORECASE | re.VERBOSE,
+)
+
+# Transcript + c. part, e.g. "NM_000000.0:c.100A>G", "ENST00000205557.12:c.2428G>A"
+_HGVSC_TXT_RE = re.compile(
+    r"""
+    ^\s*
+    (?P<tx>
+        (?:N[MR]|X[MR]|E(?:NST)?)      # NM/NR/XM/XR/ENST
+        [_]?\d+(?:\.\d+)?              # id with optional dot-version
+    )
+    :
+    (?P<c>c\..+)$
+    """,
+    re.IGNORECASE | re.VERBOSE,
+)
+
+
+# ----------------------
+# Core domain data class
+# ----------------------
+
 
 @dataclass
 class Genotype:
@@ -38,17 +103,17 @@ class Genotype:
 
     Attributes:
         genotype_patient_ID: Unique alphanumeric patient identifier.
-        contact_email: Email for follow‑up communications.
+        contact_email: Email for follow-up communications.
         phasing: True if variant is phased, False otherwise.
-        chromosome: Chromosome name or encoding (e.g., 'chr16' or 'hgvs').
-        start_position: 1‑based start coordinate (nonnegative integer).
-        end_position: 1‑based end coordinate (nonnegative integer).
+        chromosome: Chromosome name/encoding (e.g., 'chr16' or 'hgvs').
+        start_position: 1-based start coordinate (non-negative integer).
+        end_position: 1-based end coordinate (non-negative integer).
         reference: Reference allele sequence.
         alternate: Alternate allele sequence.
-        gene_symbol: Official gene symbol (e.g., “BRCA1”).
-        hgvsg: HGVS genomic notation (e.g., “g.100A>T”).
-        hgvsc: HGVS coding DNA notation (e.g., “c.200A>T”).
-        hgvsp: HGVS protein notation (e.g., “p.Lys67Asn”).
+        gene_symbol: Official HGNC gene symbol (e.g., “BRCA1”).
+        hgvsg: Genomic HGVS notation (e.g., “1:g.100A>T” or “chr1:g.100A>T”).
+        hgvsc: Coding DNA HGVS notation (e.g., “NM_000000.0:c.200A>T”).
+        hgvsp: Protein HGVS notation (e.g., “NP_000000.0:p.Lys67Asn”).
         zygosity: One of the allowed zygosity terms.
         inheritance: One of the allowed inheritance modes.
     """
@@ -68,29 +133,29 @@ class Genotype:
     zygosity: str
     inheritance: str
 
-    def __post_init__(self):
-        # Validate patient ID
+    # -----------------------------
+    # Input validation on init time
+    # -----------------------------
+
+    def __post_init__(self) -> None:
+        """Validate basic identifier formats and required string fields."""
         if not _VALID_ID.match(self.genotype_patient_ID):
             raise ValueError(f"Invalid patient ID: {self.genotype_patient_ID!r}")
 
-        # Validate email format
         if not _EMAIL_PATTERN.match(self.contact_email):
             raise ValueError(f"Invalid contact email: {self.contact_email!r}")
 
-        # Validate chromosome: allow either a known encoding or real 'chr*' names
         chrom_lower = self.chromosome.lower()
         if not (
             chrom_lower in _ALLOWED_CHROM_ENCODINGS or chrom_lower.startswith("chr")
         ):
             raise ValueError(f"Unrecognized chromosome: {self.chromosome!r}")
 
-        # Validate positions
         for attr in ("start_position", "end_position"):
             val = getattr(self, attr)
             if not isinstance(val, int) or val < 0:
-                raise ValueError(f"{attr} must be a non‑negative integer, got {val!r}")
+                raise ValueError(f"{attr} must be a non-negative integer, got {val!r}")
 
-        # Validate allele/gene/HGVS strings
         for attr in (
             "reference",
             "alternate",
@@ -103,25 +168,209 @@ def __post_init__(self):
             if not isinstance(val, str) or not val.strip():
                 raise ValueError(f"{attr} must be a nonempty string")
 
-        # Validate zygosity
         if self.zygosity not in _ALLOWED_ZYGOSITIES:
             raise ValueError(f"Invalid zygosity: {self.zygosity!r}")
 
-        # Validate inheritance
         if self.inheritance not in _ALLOWED_INHERITANCE_MODES:
             raise ValueError(f"Invalid inheritance mode: {self.inheritance!r}")
 
+    # ----------------------
+    # Convenience properties
+    # ----------------------
+
     @property
     def zygosity_code(self) -> str:
+        """Return the numeric part of the GENO: allelic_state code for this zygosity."""
+        try:
+            return _GENO_ALLELIC_STATE_CODES[self.zygosity]
+        except KeyError as e:
+            raise ValueError(
+                f"No GENO code defined for zygosity {self.zygosity!r}"
+            ) from e
+
+    # --------------------------------------------------------------------------
+    # Core responsibility: build a VariationDescriptor (VV path or local fallback)
+    # --------------------------------------------------------------------------
+
+    def to_variation_descriptor(self) -> "pps2.VariationDescriptor":
         """
-        Returns the numeric portion of the GENO: alleleic_state code
-        corresponding to this Genotype's zygosity.
+        Build a GA4GH VariationDescriptor for this variant.
+
+        Resolution order:
+        1) If P6_SKIP_VV is set → build locally.
+        2) Else, parse transcript+c. from hgvsc and attempt VV via pyphetools.
+           - If VV returns a usable object: enrich & return.
+           - On VV/network/shape issues: build locally.
+        All paths de-duplicate expressions to avoid double g.HGVS entries.
         """
+        if os.getenv("P6_SKIP_VV", "").strip().lower() in {"1", "true"}:
+            vd = self._build_local_descriptor()
+            return self._enrich_descriptor_common(vd)
+
+        tx, c_part = self._parse_hgvsc(self.hgvsc)
+        if not (tx and c_part):
+            vd = self._build_local_descriptor()
+            return self._enrich_descriptor_common(vd)
+
+        # Try building via VariantValidator; keep failures graceful.
         try:
-            return _GENO_ALLELIC_STATE_CODES[self.zygosity]
-        except KeyError:
-            raise ValueError(f"No GENO code defined for zygosity {self.zygosity!r}")
+            vv = VariantValidator(genome_build="GRCh38", transcript=tx)
+            hv = vv.encode_hgvs(c_part)  # pyphetools expects ONLY the c. part
+            vi = hv.to_variant_interpretation_202()
+            vd = vi.variation_descriptor
+        except (
+            requests.RequestException,
+            ValueError,
+            TypeError,
+            AttributeError,
+            KeyError,
+        ):
+            vd = self._build_local_descriptor()
+
+        return self._enrich_descriptor_common(vd)
+
+    # -----------------------
+    # Internal helper methods
+    # -----------------------
+
+    @staticmethod
+    def _parse_hgvsc(hgvsc: str) -> Tuple[Optional[str], Optional[str]]:
+        """
+        Extract transcript identifier and the c. part from an hgvsc string.
 
+        Examples:
+            "NM_000000.0:c.100A>G" -> ("NM_000000.0", "c.100A>G")
+            "ENST00000205557.12:c.2428G>A" -> ("ENST00000205557.12", "c.2428G>A")
+        """
+        if not isinstance(hgvsc, str):
+            return None, None
+        m = _HGVSC_TXT_RE.match(hgvsc.strip())
+        if not m:
+            return None, None
+        return m.group("tx"), m.group("c")
+
+    @staticmethod
+    def _normalize_g_expression(hgvsg: str) -> Optional[str]:
+        """
+        Normalize genomic HGVS: 'chr16:g.100a>g' -> '16:g.100A>G' (SNVs are upper-cased).
+        Other inputs are trimmed and lose any 'chr' prefix; empty/non-str gives None.
+        """
+        if not isinstance(hgvsg, str) or not hgvsg.strip():
+            return None
+        s = hgvsg.strip()
+        m = _HGVS_G_SNV.match(s)
+        if m:
+            chrom = m.group("chrom").upper()  # pattern is case-insensitive: x -> X
+            pos = m.group("pos")
+            ref = m.group("ref").upper()
+            alt = m.group("alt").upper()
+            return f"{chrom}:g.{pos}{ref}>{alt}"
+        if s.lower().startswith("chr"):
+            return s[3:]
+        return s
+
+    # ---- Descriptor builders --------------------------------------------------
+
+    def _build_local_descriptor(self) -> "pps2.VariationDescriptor":
+        """
+        Construct a minimal local VariationDescriptor using normalized g.HGVS,
+        gene symbol, and zygosity. Used when VV is unavailable or disabled.
+        """
+        vd = pps2.VariationDescriptor()
+
+        # Add g. expression if present (local path never has any expressions yet)
+        g_value = self._normalize_g_expression(self.hgvsg)
+        if g_value:
+            self._add_hgvs_expression(vd, g_value, syntax_name="HGVS")
+
+        # Allelic state (GENO)
+        if self.zygosity:
+            vd.allelic_state.id = f"GENO:{self.zygosity_code}"
+            vd.allelic_state.label = self.zygosity
+
+        # Gene context (optional)
+        if self.gene_symbol:
+            try:
+                vd.gene_context.symbol = self.gene_symbol
+            except (AttributeError, TypeError):
+                # Do not let proto accessors sink the run
+                pass
+
+        return vd
+
+    def _enrich_descriptor_common(
+        self, vd: "pps2.VariationDescriptor"
+    ) -> "pps2.VariationDescriptor":
+        """
+        Common post-processing for both VV-built and locally-built descriptors:
+        - ensure allelic_state matches our zygosity,
+        - ensure gene_context has our gene symbol (if missing),
+        - ensure a normalized g.HGVS expression is present exactly once.
+        """
+        # Allelic state
+        if self.zygosity:
+            vd.allelic_state.id = f"GENO:{self.zygosity_code}"
+            vd.allelic_state.label = self.zygosity
+
+        # Gene symbol (do not overwrite a non-empty symbol VV may have provided)
+        try:
+            gene_ctx = getattr(vd, "gene_context", None)
+            if self.gene_symbol and (
+                gene_ctx is None or not getattr(gene_ctx, "symbol", "")
+            ):
+                vd.gene_context.symbol = self.gene_symbol
+        except (AttributeError, TypeError):
+            pass
+
+        # Add normalized g.HGVS only if not already present (dedupe patch)
+        g_value = self._normalize_g_expression(self.hgvsg)
+        if g_value:
+            self._add_hgvs_expression_if_missing(vd, g_value, syntax_name="HGVS")
+
+        return vd
+
+    # ---- Expression utilities -------------------------------------------------
+
+    @staticmethod
+    def _expression_values(vd: "pps2.VariationDescriptor") -> set[str]:
+        """Collect all current Expression.value strings for fast membership checks."""
+        try:
+            return {e.value for e in vd.expressions}
+        except (AttributeError, TypeError):
+            return set()
+
+    @classmethod
+    def _add_hgvs_expression_if_missing(
+        cls, vd: "pps2.VariationDescriptor", value: str, *, syntax_name: str = "HGVS"
+    ) -> None:
+        """
+        Add a new HGVS Expression only if an identical value is not already present.
+        Keeps VV-returned expressions from being duplicated by local enrichment.
+        """
+        if not value:
+            return
+        if value in cls._expression_values(vd):
+            return
+        cls._add_hgvs_expression(vd, value, syntax_name=syntax_name)
 
-# Map our human‐readable zygosity → the GA4GH GENO codes
-# allelic_state_GENO_zygosity_codes: dict[str, str] = {"heterozygous": "0000135", "homozygous": "0000136", "mosaic": "0000539", "hemizygous": "0000144", "compound_heterozygosity": "0000140"}
+    @staticmethod
+    def _add_hgvs_expression(
+        vd: "pps2.VariationDescriptor", value: str, syntax_name: str = "HGVS"
+    ) -> None:
+        """
+        Append an Expression to a VariationDescriptor.
+
+        Parameters
+        ----------
+        vd : VariationDescriptor
+            The descriptor to mutate.
+        value : str
+            HGVS string to append.
+        syntax_name : str, optional
+            Desired syntax enum name ('HGVS'), by default "HGVS".
+        """
+        expr = vd.expressions.add()
+        expr.value = value
+        enum = getattr(type(expr), syntax_name, None)
+        if enum is not None and hasattr(expr, "syntax"):
+            expr.syntax = enum  # type: ignore[attr-defined]
diff --git a/src/P6/mapper.py b/src/P6/mapper.py
index c17e9b9..0b4433b 100644
--- a/src/P6/mapper.py
+++ b/src/P6/mapper.py
@@ -835,12 +835,13 @@ def _add_phenotypic_features(pkt, phenotypes: list) -> None:
             if not phenotype.status:
                 feature.excluded = True
 
+    # Genomic interpretations are built from Genotype.to_variation_descriptor().
     @staticmethod
     def _add_genotype_interpretations(pkt, genotypes: list, patient_id: str) -> None:
         """
         Add Interpretation → Diagnosis → GenomicInterpretation blocks.
-        Minimal VariationDescriptor with HGVS expression; set optional
-        location/alleles when supported by the installed protobufs.
+        Use the Genotype dataclass helper to build VariationDescriptor,
+        so gene symbol / zygosity / inheritance are preserved.
         """
 
         for interpretation_index, genotype_record in enumerate(genotypes):
@@ -856,38 +857,23 @@ def _add_genotype_interpretations(pkt, genotypes: list, patient_id: str) -> None
                 genomic_interpretation_entry.InterpretationStatus.CONTRIBUTORY
             )
 
-            # VariationDescriptor with HGVS expression
+            # VariationDescriptor: delegate to Genotype dataclass
             variant_interpretation = genomic_interpretation_entry.variant_interpretation
             variation_descriptor = variant_interpretation.variation_descriptor
 
-            expression = variation_descriptor.expressions.add()
-            # Attempt to set the HGVS syntax enum if available
-            try:
-                expression.syntax = pps2.VariationDescriptor.Expression.HGVS
-            except AttributeError:
-                pass
-            # expression.value = genotype_record.hgvsg or ""
-            # Canonicalize: serialize without optional 'chr' prefix so it matches
-            # expected '16:g.100A>G' style while still accepting either form as input.
-            hgvs = (genotype_record.hgvsg or "").strip()
-            if hgvs.lower().startswith("chr"):
-                hgvs = hgvs[3:]
-            expression.value = hgvs
-
-            # Optional: attempt to set a subset of location/alleles if supported
             try:
-                location_context = variation_descriptor.location
-                location_context.interval.interval_type = (
-                    pps2.VariationDescriptor.Location.Interval.Type.EXACT
-                )
-                location_context.interval.start = genotype_record.start_position
-                location_context.interval.end = genotype_record.end_position
-                location_context.reference_sequence_id = genotype_record.chromosome
-                variation_descriptor.reference = genotype_record.reference
-                variation_descriptor.alternate = genotype_record.alternate
+                variation_descriptor.CopyFrom(genotype_record.to_variation_descriptor())
             except AttributeError:
-                # Some library builds do not expose these submessages; skip gracefully.
-                pass
+                # Fallback to old behavior if helper not available
+                expression = variation_descriptor.expressions.add()
+                try:
+                    expression.syntax = pps2.VariationDescriptor.Expression.HGVS
+                except AttributeError:
+                    pass
+                hgvs = (genotype_record.hgvsg or "").strip()
+                if hgvs.lower().startswith("chr"):
+                    hgvs = hgvs[3:]
+                expression.value = hgvs
 
     @staticmethod
     def _add_diseases_to_packet(pkt, diseases: list) -> None:
diff --git a/src/P6/vv_lookup.py b/src/P6/vv_lookup.py
new file mode 100644
index 0000000..df9b3f8
--- /dev/null
+++ b/src/P6/vv_lookup.py
@@ -0,0 +1,223 @@
+"""
+VariantValidator cross-reference helpers.
+
+High level (P6 perspective)
+---------------------------
+This module is an optional enrichment layer used by P6 to attach gene
+cross-references (HGNC ID, Ensembl gene ID, and canonical transcript
+accessions) to the VariationDescriptor's gene_context, *after* the core
+variant normalization path has succeeded.
+
+Key behaviors
+-------------
+- Calls VariantValidator's `gene2transcripts` endpoints (v2 preferred, v1 fallback).
+- Normalizes the sometimes-variable VV payloads into a small, stable dict:
+  {
+      "hgnc_id": "HGNC:####",
+      "ensembl_gene_id": "ENSG###########",
+      "refseq_transcripts": [...],
+      "ensembl_transcripts": [...]
+  }
+- Uses small retry/backoff for resilience.
+- Is deliberately decoupled from the main parsing flow: any failure raises
+  `VVLookupError`; the caller should catch and ignore if enrichment isn't critical.
+
+Environment
+-----------
+VV_BASE_URL   : Optional base URL override (default "https://rest.variantvalidator.org")
+"""
+
+from __future__ import annotations
+
+from functools import lru_cache
+from typing import Any, Dict, List
+import json
+import os
+import time
+from urllib.parse import quote as _urlencode
+
+import requests
+
+
+class VVLookupError(RuntimeError):
+    """Raised when VariantValidator enrichment lookups fail."""
+
+
+# ------------------------------------------------------------------------------
+# Module configuration
+# ------------------------------------------------------------------------------
+
+_VV_BASE = os.getenv("VV_BASE_URL", "https://rest.variantvalidator.org").rstrip("/")
+
+
+# ------------------------------------------------------------------------------
+# Small utilities
+# ------------------------------------------------------------------------------
+
+
+def _sleep_backoff(i: int) -> None:
+    """
+    Sleep using a small exponential backoff (polite to the VV API).
+    Sequence ~ 0.25s, 0.5s, 1s, 2s.
+    """
+    time.sleep(0.25 * (2**i))
+
+
+def _request_json(url: str, *, timeout: float = 10.0) -> dict:
+    """
+    GET JSON with simple retry/backoff for VV endpoints.
+
+    Retries a few times on network/HTTP/JSON decode problems and raises
+    VVLookupError if all attempts fail; backs off only between attempts.
+    """
+    last_exc: Exception | None = None
+    for i in range(4):  # attempts: 0,1,2,3
+        try:
+            resp = requests.get(url, timeout=timeout)
+            resp.raise_for_status()
+            return resp.json()
+        except (requests.RequestException, json.JSONDecodeError, ValueError) as e:
+            last_exc = e
+            if i < 3:  # no retry follows the last attempt, so do not sleep
+                _sleep_backoff(i)
+    raise VVLookupError(f"Failed GET {url}: {last_exc}") from last_exc
+
+
+# ------------------------------------------------------------------------------
+# Payload normalizers (keep public API stable even if VV changes shape)
+# ------------------------------------------------------------------------------
+
+
+def _parse_v2_payload(payload: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    Normalize the gene2transcripts_v2 response into a compact dict.
+
+    Expected output keys:
+      - hgnc_id (str)
+      - ensembl_gene_id (str)
+      - refseq_transcripts (List[str])
+      - ensembl_transcripts (List[str])
+    """
+    out: Dict[str, Any] = {
+        "hgnc_id": "",
+        "ensembl_gene_id": "",
+        "refseq_transcripts": [],
+        "ensembl_transcripts": [],
+    }
+    if not isinstance(payload, dict):
+        return out
+
+    hgnc = payload.get("hgnc", {})
+    if isinstance(hgnc, dict):
+        out["hgnc_id"] = hgnc.get("hgnc_id", "") or hgnc.get("HGNC_ID", "")
+        out["ensembl_gene_id"] = hgnc.get("ensembl_gene_id", "") or hgnc.get(
+            "ensembl", ""
+        )
+
+    def _collect(lst: Any) -> List[str]:
+        accs: List[str] = []
+        if isinstance(lst, list):
+            for item in lst:
+                if isinstance(item, dict) and item.get("accession"):
+                    accs.append(str(item["accession"]))
+        return accs
+
+    out["refseq_transcripts"] = _collect(payload.get("refseq"))
+    out["ensembl_transcripts"] = _collect(payload.get("ensembl"))
+    return out
+
+
+def _parse_v1_payload(payload: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    Normalize the gene2transcripts (v1) response into a compact dict.
+
+    v1 is simpler; transcripts are often plain string lists.
+    """
+    out: Dict[str, Any] = {
+        "hgnc_id": "",
+        "ensembl_gene_id": "",
+        "refseq_transcripts": [],
+        "ensembl_transcripts": [],
+    }
+    if not isinstance(payload, dict):
+        return out
+
+    out["hgnc_id"] = payload.get("hgnc_id", "") or payload.get("HGNC_ID", "")
+    out["ensembl_gene_id"] = payload.get("ENSEMBL", "") or payload.get("ensembl", "")
+
+    rs = payload.get("refseq") or payload.get("RefSeq") or []
+    if isinstance(rs, list):
+        out["refseq_transcripts"] = [str(r) for r in rs if isinstance(r, str)]
+
+    es = payload.get("ensembl_transcripts") or payload.get("ensembl") or []
+    if isinstance(es, list):
+        out["ensembl_transcripts"] = [str(e) for e in es if isinstance(e, str)]
+
+    return out
+
+
+# ------------------------------------------------------------------------------
+# Public API
+# ------------------------------------------------------------------------------
+
+
+@lru_cache(maxsize=2048)
+def get_gene_xrefs_vv(
+    gene_query: str,
+    *,
+    genome_build: str = "GRCh38",
+    transcript_set: str = "refseq",  # {refseq, ensembl, all}
+    limit_transcripts: str = "mane",  # {mane, mane_select, select, raw, all}
+) -> Dict[str, Any]:
+    """
+    Fetch HGNC, Ensembl, and transcript xrefs for an HGNC symbol/ID or transcript.
+
+    Parameters
+    ----------
+    gene_query : str
+        HGNC symbol/ID (e.g., 'ABCC6' or 'HGNC:36') or a transcript ID (NM_/ENST_).
+    genome_build : str, optional
+        'GRCh37' or 'GRCh38' (default 'GRCh38').
+    transcript_set : str, optional
+        'refseq', 'ensembl', or 'all' (default 'refseq').
+    limit_transcripts : str, optional
+        VV's limiting switch (default 'mane'). Useful values: 'mane', 'mane_select', 'select'.
+
+    Returns
+    -------
+    dict
+        Compact dict with keys: 'hgnc_id', 'ensembl_gene_id',
+        'refseq_transcripts', 'ensembl_transcripts'.
+
+    Raises
+    ------
+    VVLookupError
+        If VV is unreachable or returns an unparseable/empty payload.
+    """
+    if not gene_query or not isinstance(gene_query, str):
+        raise VVLookupError("gene_query must be a non-empty string")
+    gene_query = gene_query.strip()
+
+    # Preferred v2 endpoint
+    v2_url = (
+        f"{_VV_BASE}/VariantValidator/tools/gene2transcripts_v2/"
+        f"{_urlencode(gene_query)}/{_urlencode(limit_transcripts)}/"
+        f"{_urlencode(transcript_set)}/{_urlencode(genome_build)}"
+        "?content-type=application%2Fjson"
+    )
+    v2_data = _request_json(v2_url)
+    v2_norm = _parse_v2_payload(v2_data)
+    if any(v2_norm.values()):
+        return v2_norm
+
+    # Fallback to the simpler v1 endpoint
+    v1_url = (
+        f"{_VV_BASE}/VariantValidator/tools/gene2transcripts/"
+        f"{_urlencode(gene_query)}?content-type=application%2Fjson"
+    )
+    v1_data = _request_json(v1_url)
+    v1_norm = _parse_v1_payload(v1_data)
+    if any(v1_norm.values()):
+        return v1_norm
+
+    raise VVLookupError(f"No xrefs found for {gene_query!r}")