Commit 1b7b013
[TRTLLM-11851][feat] MX adapter improvements: env-var fallback, query timeout, model_name plumbing
Three discrete improvements to the MX side of PR NVIDIA#13045, driven by review feedback from the MX team's downstream PR (chienchunhung/TensorRT-LLM #1). Three orchestration-ergonomics fixes land as one focused commit so reviewers see them as a clean slice on top of the prototype.

(1) MODEL_EXPRESS_URL env-var fallback. At the validator level, TorchLlmArgs.validate_mx_config now honors the upstream ``MODEL_EXPRESS_URL`` env var when ``checkpoint_format='MX'`` and ``mx_server_url`` is unset. Resolution happens at validator time so the value ends up on ``llm_args.mx_server_url`` (visible to logging, /startup_metrics, and downstream code) instead of being silently re-read from the environment by the loader. This lets orchestrators (e.g. Dynamo) configure MX via the environment without plumbing every CLI knob, while keeping resolution in one place. An explicit ``mx_server_url=`` always wins. The env-var fallback only fires when MX is the active checkpoint format (so HF-only configs aren't surprised by an unrelated env var), and an empty string in the environment is treated as unset.

(2) MX_SOURCE_QUERY_TIMEOUT defensive default. MXCheckpointLoader.__init__ calls ``os.environ.setdefault("MX_SOURCE_QUERY_TIMEOUT", "30")`` whenever an MX server URL is configured. This caps cold-cluster first-replica startup at 30 s instead of upstream's 1-hour default (the polling in MxLiveWeightLoader._query_source). setdefault semantics preserve any explicit user value, and HF-only loads (no MX URL) don't touch the environment at all. The proper upstream-side fix is a non-blocking source-query API (tracked as MX-4 in §15 of the design doc); this defensive default caps the worst case until that lands.

(3) model_name plumbing with an HF-snapshot-aware resolver. Plumbs ``llm_args.model → MXCheckpointLoader(model_name=...)`` so upstream's ``publish_model_params()`` publishes under the user-supplied Hub ID (e.g. "Qwen/Qwen2.5-72B-Instruct") instead of the "unknown" sentinel.
- MXCheckpointLoader takes a new optional ``model_name`` constructor arg (Union[str, Path]), coerced to str at construction time.
- publish_as_source() now sets BOTH the MODEL_EXPRESS_URL and MODEL_NAME env vars (resolving identity via the priority order below) and restores both in finally. publish_model_params() reads them from the environment, as documented.
- Identity resolution order: explicit constructor arg → MODEL_NAME env → checkpoint_dir basename (with HF-snapshot path unmangling) → "unknown".
- The HF cache layout (".../models--<org>--<name>/snapshots/<sha>/") is unmangled back to "<org>/<name>" instead of returning the commit hash.
- _construct_checkpoint_loader plumbs ``mx_model_name`` through; py_executor_creator.py extracts it from llm_args.model.

Both env-var dances (MODEL_EXPRESS_URL + MODEL_NAME) collapse into one direct call when MX-2 (public build_identity) lands upstream. Tests for these three additions are in the next commit.

Signed-off-by: Chien-Chun Hung <2679986+chienchunhung@users.noreply.github.com>
Made-with: Cursor
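The setdefault semantics described in point (2) can be exercised standalone; this is a minimal illustration of the stdlib behavior the loader relies on, not the loader code itself:

```python
import os

# Cold start: the variable is unset, so the defensive 30 s default applies.
os.environ.pop("MX_SOURCE_QUERY_TIMEOUT", None)
os.environ.setdefault("MX_SOURCE_QUERY_TIMEOUT", "30")
print(os.environ["MX_SOURCE_QUERY_TIMEOUT"])  # 30

# An operator's explicit value survives: setdefault never overwrites.
os.environ["MX_SOURCE_QUERY_TIMEOUT"] = "120"
os.environ.setdefault("MX_SOURCE_QUERY_TIMEOUT", "30")
print(os.environ["MX_SOURCE_QUERY_TIMEOUT"])  # 120
```

This is why an explicit user value always wins without any extra branching in the loader.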
1 parent 8ecfa78 commit 1b7b013

4 files changed (153 additions & 17 deletions)

tensorrt_llm/_torch/models/checkpoints/mx/checkpoint_loader.py

Lines changed: 126 additions & 15 deletions
@@ -28,7 +28,9 @@
 ``HfCheckpointLoader`` base class.
 """

-from typing import Any, Optional
+import os
+from pathlib import Path
+from typing import Any, Optional, Union

 from tensorrt_llm._torch.models.checkpoints.base_config_loader import BaseConfigLoader
 from tensorrt_llm._torch.models.checkpoints.base_weight_loader import BaseWeightLoader
@@ -38,6 +40,16 @@
 from tensorrt_llm.logger import logger
 from tensorrt_llm.mapping import Mapping

+# Defensive default for the upstream ``MX_SOURCE_QUERY_TIMEOUT`` env var.
+# The upstream ``MxLiveWeightLoader`` polls the MX server every 5 s for up
+# to ``MX_SOURCE_QUERY_TIMEOUT`` seconds (default 3600 = 1 hour) waiting
+# for a source. On a cold cluster (no donor up yet), this means the very
+# first replica blocks for an hour before falling back to disk. We cap
+# the default at 30 s so first-replica startup degrades gracefully; users
+# can still override via the env var or a future per-loader knob.
+# Tracked as MX-4 in §15 (non-blocking source-query API upstream).
+_MX_SOURCE_QUERY_TIMEOUT_DEFAULT_S = "30"
+

 @register_checkpoint_loader("MX")
 class MXCheckpointLoader(HfCheckpointLoader):
@@ -68,6 +80,7 @@ def __init__(
         weight_mapper: Optional[BaseWeightMapper] = None,
         config_loader: Optional[BaseConfigLoader] = None,
         mx_server_url: Optional[str] = None,
+        model_name: Optional[Union[str, Path]] = None,
     ):
         super().__init__(
             weight_loader=weight_loader,
@@ -78,8 +91,24 @@ def __init__(
         # caller reading self._checkpoint_format directly also sees "MX".
         self._checkpoint_format = "MX"
         self._mx_server_url = mx_server_url
+        # ``model_name`` is the human-readable identity to publish/look up
+        # under on the MX server. Typically the user-supplied
+        # ``llm_args.model`` (a Hub ID like ``"Qwen/Qwen2.5-72B-Instruct"``
+        # or a local path). ``publish_as_source()`` resolves it via
+        # :func:`_resolve_mx_model_name` (with HF-snapshot path fallback).
+        self._model_name = str(model_name) if model_name is not None else None
         self._p2p_succeeded = False

+        # Defensive default for upstream's source-query timeout. Only
+        # applied when an MX server URL is configured (so HF-only loads
+        # are unaffected). Uses ``setdefault`` so an explicit user value
+        # always wins.
+        if mx_server_url is not None:
+            os.environ.setdefault(
+                "MX_SOURCE_QUERY_TIMEOUT",
+                _MX_SOURCE_QUERY_TIMEOUT_DEFAULT_S,
+            )
+
     @property
     def checkpoint_format(self) -> str:
         """Override parent's checkpoint_format to return 'MX'."""
@@ -89,6 +118,17 @@ def checkpoint_format(self) -> str:
     def mx_server_url(self) -> Optional[str]:
         return self._mx_server_url

+    @property
+    def model_name(self) -> Optional[str]:
+        """Explicit model identity passed to the constructor (if any).
+
+        Note this is the *as-configured* value (e.g. ``llm_args.model``),
+        not the final resolved identity that ends up in the published
+        ``MODEL_NAME``. The full resolution (with env var and basename
+        fallbacks) happens inside :meth:`publish_as_source`.
+        """
+        return self._model_name
+
     @property
     def p2p_succeeded(self) -> bool:
         """Whether the last load_weights() call used P2P transfer.
@@ -221,11 +261,12 @@ def publish_as_source(self, model, mapping: Mapping = None, checkpoint_dir: str
             mapping: Distributed mapping. Currently unused — kept for
                 signature symmetry with the prior prototype API and for
                 forward-compat with future upstream signatures.
-            checkpoint_dir: Checkpoint directory. Currently unused —
-                upstream uses the ``MODEL_NAME`` env var for identity.
+            checkpoint_dir: Checkpoint directory. Used as a last-resort
+                fallback for resolving the ``MODEL_NAME`` identity when
+                neither ``model_name`` was passed to the constructor nor
+                ``MODEL_NAME`` is set in the environment.
         """
-        # mapping/checkpoint_dir are deliberately unused; see docstring.
-        del mapping, checkpoint_dir
+        del mapping  # currently unused; see docstring.

         if self._mx_server_url is None:
             return
@@ -238,17 +279,29 @@ def publish_as_source(self, model, mapping: Mapping = None, checkpoint_dir: str
             logger.debug("modelexpress library not installed; skipping MX publish.")
             return

-        # Upstream publish_model_params reads MODEL_EXPRESS_URL from env;
-        # set it from our config so the per-server URL is respected.
-        import os
+        # Upstream publish_model_params reads MODEL_EXPRESS_URL and
+        # MODEL_NAME from the environment. Set both from our resolved
+        # configuration so per-instance values (URL passed via
+        # llm_args.mx_server_url, identity from llm_args.model) are
+        # respected, then restore prior state. Tracked as MX-2 in §15
+        # (the env-var dance goes away when upstream exports a public
+        # ``build_identity()`` we can call directly).
+        resolved_name = _resolve_mx_model_name(self._model_name, checkpoint_dir)
+
+        env_overrides = {
+            "MODEL_EXPRESS_URL": self._mx_server_url,
+            "MODEL_NAME": resolved_name,
+        }
+        prior = {key: os.environ.get(key) for key in env_overrides}
+        for key, value in env_overrides.items():
+            os.environ[key] = value

-        prior_url = os.environ.get("MODEL_EXPRESS_URL")
-        os.environ["MODEL_EXPRESS_URL"] = self._mx_server_url
         try:
             publish_model_params(model)
             logger.info(
-                "Published weights to MX server at %s",
+                "Published weights to MX server at %s as model=%r",
                 self._mx_server_url,
+                resolved_name,
             )
         except Exception as e:
             logger.warning(
@@ -257,7 +310,65 @@ def publish_as_source(self, model, mapping: Mapping = None, checkpoint_dir: str
                 e,
             )
         finally:
-            if prior_url is None:
-                os.environ.pop("MODEL_EXPRESS_URL", None)
-            else:
-                os.environ["MODEL_EXPRESS_URL"] = prior_url
+            for key, prior_value in prior.items():
+                if prior_value is None:
+                    os.environ.pop(key, None)
+                else:
+                    os.environ[key] = prior_value
+
+
+# ---------------------------------------------------------------------------
+# Module-level helpers
+# ---------------------------------------------------------------------------
+
+
+def _resolve_mx_model_name(model_name_arg: Optional[str], checkpoint_dir: Optional[str]) -> str:
+    """Resolve a stable model identity for publishing to the MX server.
+
+    Resolution order (first non-empty wins):
+
+    1. ``model_name_arg`` — the explicit value passed at construction
+       time (typically ``llm_args.model``: a Hub ID like
+       ``"Qwen/Qwen2.5-72B-Instruct"`` or a local path).
+    2. ``MODEL_NAME`` env var — upstream's existing convention.
+    3. ``checkpoint_dir`` basename, with HF-snapshot path fallback so
+       ``.../models--<org>--<name>/snapshots/<sha>/`` resolves to
+       ``"<org>/<name>"`` instead of the commit hash.
+    4. Literal ``"unknown"`` — matches upstream's own sentinel.
+    """
+    candidate = model_name_arg or os.environ.get("MODEL_NAME") or checkpoint_dir
+    if not candidate:
+        return "unknown"
+    return _normalize_model_identity(str(candidate))
+
+
+def _normalize_model_identity(s: str) -> str:
+    """Convert a model identifier to a stable, human-readable name.
+
+    Hub IDs (``"org/name"``) and arbitrary user-provided strings are
+    returned unchanged. Filesystem paths are reduced to a basename, with
+    HuggingFace cache snapshot layouts (``snapshots/<commit-sha>/``)
+    walked up to recover the original ``"org/name"`` identity.
+    """
+    if not s:
+        return "unknown"
+
+    # Heuristic: a Hub ID is bare ``"name"`` or ``"org/name"``. Anything
+    # that starts with a path separator/expansion or contains more than
+    # one "/" is treated as a path. Single-"/" strings remain ambiguous;
+    # we side with the Hub ID interpretation unless the path also exists
+    # on disk (in which case we assume the user gave us a real path).
+    looks_like_path = s.startswith(("/", "./", "../", "~")) or s.count("/") > 1 or os.path.exists(s)
+    if not looks_like_path:
+        return s
+
+    p = Path(s).expanduser()
+    name = p.name
+    if name and "snapshots" in p.parts:
+        # HF cache layout: ``.../models--<org>--<name>/snapshots/<sha>/``.
+        # Walk up to find the ``models--<org>--<name>`` directory and
+        # un-mangle it back to ``"<org>/<name>"``.
+        for ancestor in p.parents:
+            if ancestor.name.startswith("models--"):
+                return ancestor.name[len("models--") :].replace("--", "/")
+    return name or "unknown"
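The resolver's behavior can be checked standalone. Below is a self-contained replica of the ``_normalize_model_identity`` logic from the diff above, for illustration only; the cache path in the example is hypothetical:

```python
import os
from pathlib import Path

def normalize_model_identity(s: str) -> str:
    """Replica of the diff's identity normalizer: Hub IDs pass through,
    paths reduce to a basename, HF cache snapshots un-mangle to org/name."""
    if not s:
        return "unknown"
    # Same heuristic as the diff: path-like prefixes, >1 slash, or an
    # existing on-disk path mean "treat as filesystem path".
    looks_like_path = s.startswith(("/", "./", "../", "~")) or s.count("/") > 1 or os.path.exists(s)
    if not looks_like_path:
        return s
    p = Path(s).expanduser()
    name = p.name
    if name and "snapshots" in p.parts:
        for ancestor in p.parents:
            if ancestor.name.startswith("models--"):
                return ancestor.name[len("models--"):].replace("--", "/")
    return name or "unknown"

print(normalize_model_identity("Qwen/Qwen2.5-72B-Instruct"))
# -> "Qwen/Qwen2.5-72B-Instruct" (Hub ID unchanged)
print(normalize_model_identity(
    "/root/.cache/huggingface/hub/models--Qwen--Qwen2.5-72B-Instruct/snapshots/abc123"))
# -> "Qwen/Qwen2.5-72B-Instruct" (cache layout un-mangled, not "abc123")
print(normalize_model_identity("/models/llama-3-8b"))
# -> "llama-3-8b" (plain path reduces to basename)
```

This is the behavior the commit message calls "HF-snapshot path unmangling": without the ``models--`` walk-up, the published identity would be the commit SHA.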

tensorrt_llm/_torch/pyexecutor/model_loader.py

Lines changed: 6 additions & 2 deletions
@@ -171,6 +171,7 @@ def _construct_checkpoint_loader(
     checkpoint_format: Optional[str],
     *,
     mx_server_url: Optional[str] = None,
+    mx_model_name: Optional[str] = None,
 ) -> Optional[BaseCheckpointLoader]:
     if backend == "_autodeploy":
         return None
@@ -187,8 +188,11 @@ def _construct_checkpoint_loader(

     # Pass extra kwargs for format-specific loaders (e.g. MX).
     extra_kwargs: dict = {}
-    if checkpoint_format == "MX" and mx_server_url is not None:
-        extra_kwargs["mx_server_url"] = mx_server_url
+    if checkpoint_format == "MX":
+        if mx_server_url is not None:
+            extra_kwargs["mx_server_url"] = mx_server_url
+        if mx_model_name is not None:
+            extra_kwargs["model_name"] = mx_model_name

     checkpoint_loader = BaseCheckpointLoader.get(
         checkpoint_format=checkpoint_format,

tensorrt_llm/_torch/pyexecutor/py_executor_creator.py

Lines changed: 5 additions & 0 deletions
@@ -252,11 +252,16 @@ def create_py_executor(
     skip_est = os.environ.get("TRTLLM_SKIP_KV_CACHE_ESTIMATION", '0') == '1'
     torch.cuda.set_per_process_memory_fraction(1.0)
     # Apply model-specific defaults early, before destructuring llm_args fields
+    # Pass llm_args.model through to MXCheckpointLoader so it can publish
+    # to the MX server under the user-supplied identity (Hub ID or local
+    # path basename) instead of defaulting to "unknown".
     checkpoint_loader = _construct_checkpoint_loader(
         llm_args.backend,
         llm_args.checkpoint_loader,
         llm_args.checkpoint_format,
         mx_server_url=llm_args.mx_server_url,
+        mx_model_name=str(llm_args.model)
+        if llm_args.model is not None else None,
     )
     llm_args = ModelLoader.load_config_and_apply_defaults(
         checkpoint_dir, llm_args, checkpoint_loader)

tensorrt_llm/llmapi/llm_args.py

Lines changed: 16 additions & 0 deletions
@@ -3940,6 +3940,22 @@ def validate_checkpoint_format(self):

     @model_validator(mode="after")
     def validate_mx_config(self) -> 'TorchLlmArgs':
+        # When MX is the active checkpoint format and the user did not
+        # explicitly set ``mx_server_url``, honor the ``MODEL_EXPRESS_URL``
+        # env var that the upstream ``modelexpress`` library reads
+        # (see ``modelexpress.client._get_server_url``). This lets
+        # orchestrators (e.g. Dynamo) configure MX via the environment
+        # without plumbing every CLI knob through, while keeping the
+        # resolved value visible on ``llm_args.mx_server_url`` for
+        # logging, ``/startup_metrics``, and downstream code paths.
+        if (self.checkpoint_format == "MX" and self.mx_server_url is None):
+            env_url = os.environ.get("MODEL_EXPRESS_URL")
+            if env_url:
+                logger.info(
+                    "mx_server_url not set; using MODEL_EXPRESS_URL=%s "
+                    "from environment.", env_url)
+                self.mx_server_url = env_url
+
         if self.mx_server_url is not None and self.checkpoint_format != "MX":
             logger.warning(
                 "mx_server_url is set but checkpoint_format is '%s', not "
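The precedence this validator implements can be sketched as a standalone function (an illustration under assumed names, not the actual validator; the URL is hypothetical):

```python
import os
from typing import Optional

def resolve_mx_server_url(checkpoint_format: Optional[str],
                          mx_server_url: Optional[str]) -> Optional[str]:
    """Explicit mx_server_url always wins; the MODEL_EXPRESS_URL env
    fallback fires only when MX is the active checkpoint format, and an
    empty env value counts as unset."""
    if checkpoint_format == "MX" and mx_server_url is None:
        env_url = os.environ.get("MODEL_EXPRESS_URL")
        if env_url:  # empty string is falsy, so "" is treated as unset
            return env_url
    return mx_server_url

os.environ["MODEL_EXPRESS_URL"] = "grpc://mx.internal:9001"  # hypothetical
print(resolve_mx_server_url("MX", None))          # env fallback fires
print(resolve_mx_server_url("MX", "grpc://a:1"))  # explicit value wins
print(resolve_mx_server_url("HF", None))          # non-MX config untouched
```

Resolving at validator time (rather than inside the loader) is what makes the final value visible on ``llm_args.mx_server_url`` for logging and metrics.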
