erictang000
diff --git a/pyproject.toml b/pyproject.toml
Lines changed: 16 additions & 24 deletions
diff --git a/skyrl/backends/skyrl_train/inference_engines/vllm/vllm_engine.py b/skyrl/backends/skyrl_train/inference_engines/vllm/vllm_engine.py
Lines changed: 4 additions & 4 deletions
diff --git a/skyrl/backends/skyrl_train/inference_servers/layerwise_reload.py b/skyrl/backends/skyrl_train/inference_servers/layerwise_reload.py
Lines changed: 72 additions & 34 deletions
diff --git a/skyrl/backends/skyrl_train/inference_servers/new_inference_worker_wrap.py b/skyrl/backends/skyrl_train/inference_servers/new_inference_worker_wrap.py
Lines changed: 5 additions & 5 deletions
diff --git a/skyrl/backends/skyrl_train/inference_servers/remote_inference_client.py b/skyrl/backends/skyrl_train/inference_servers/remote_inference_client.py
Lines changed: 6 additions & 6 deletions
@@ -19,7 +19,6 @@ dependencies = [
     "tokenizers>=0.21.2",
     "transformers>=5.6.1,<=5.8.0",
     "typer>=0.17.4",
-    # "wandb>=0.22.0",
     "peft==0.18.1",
     "hf_transfer",
     "cloudpathlib>=0.23.0",
@@ -106,7 +105,7 @@ skyrl-train = [
 
 fsdp = [
     "skyrl[skyrl-train]",
-    "vllm==0.20.2; sys_platform == 'linux'",
+    "vllm==0.23.0; sys_platform == 'linux'",
     "vllm-router; sys_platform == 'linux'",
     # vLLM imports `nixl._api`, but `nixl-cu12` ships the `nixl_cu12` module.
     # The `nixl` shim provides the `nixl` namespace and dispatches to `nixl_cu12`.
@@ -117,9 +116,9 @@ fsdp = [
     "causal-conv1d; sys_platform == 'linux'",
     "flash-attn==2.8.3; sys_platform == 'linux'",
     "torch==2.11.0; sys_platform == 'linux'",
-    "flashinfer-python==0.6.8.post1; sys_platform == 'linux' and platform_machine == 'x86_64'",
-    "flashinfer-jit-cache==0.6.8.post1; sys_platform == 'linux' and platform_machine == 'x86_64'",
-    "flashinfer-cubin==0.6.8.post1; sys_platform == 'linux' and platform_machine == 'x86_64'",
+    "flashinfer-python==0.6.12; sys_platform == 'linux' and platform_machine == 'x86_64'",
+    "flashinfer-jit-cache==0.6.12; sys_platform == 'linux' and platform_machine == 'x86_64'",
+    "flashinfer-cubin==0.6.12; sys_platform == 'linux' and platform_machine == 'x86_64'",
     "torchvision; sys_platform == 'linux'",
 ]
 
@@ -130,22 +129,22 @@ megatron = [
     "flash-linear-attention; sys_platform == 'linux'",
     "causal-conv1d; sys_platform == 'linux'",
     "mamba-ssm>=2.3.0; sys_platform == 'linux'",
-    "vllm==0.20.2; sys_platform == 'linux'",
+    "vllm==0.23.0; sys_platform == 'linux'",
     "vllm-router; sys_platform == 'linux'",
     # vLLM imports `nixl._api`, but `nixl-cu12` ships the `nixl_cu12` module.
     # The `nixl` shim provides the `nixl` namespace and dispatches to `nixl_cu12`.
     # Its metadata hard-depends on `nixl-cu13` too; that variant is overridden
     # out below (it would drag in the CUDA-13 stack and break the cu12 torch pin).
     "nixl; sys_platform == 'linux'",
     "torch==2.11.0; sys_platform == 'linux'",
-    "flashinfer-python==0.6.8.post1; sys_platform == 'linux' and platform_machine == 'x86_64'",
+    "flashinfer-python==0.6.12; sys_platform == 'linux' and platform_machine == 'x86_64'",
     "torchvision; sys_platform == 'linux'",
     # megatron-bridge requires Python 3.12+; pin megatron-core to the same
     # constraint so both packages are consistently available (or absent).
     "megatron-bridge; sys_platform == 'linux' and python_version >= '3.12'",
     "megatron-core; sys_platform == 'linux' and python_version >= '3.12'",
-    "flashinfer-jit-cache==0.6.8.post1; sys_platform == 'linux' and platform_machine == 'x86_64'",
-    "flashinfer-cubin==0.6.8.post1; sys_platform == 'linux' and platform_machine == 'x86_64'",
+    "flashinfer-jit-cache==0.6.12; sys_platform == 'linux' and platform_machine == 'x86_64'",
+    "flashinfer-cubin==0.6.12; sys_platform == 'linux' and platform_machine == 'x86_64'",
     "nvidia-modelopt; sys_platform == 'linux'",
 ]
 
@@ -201,18 +200,6 @@ required-environments = [
     "sys_platform == 'darwin' and platform_machine == 'arm64'",
 ]
 
-constraint-dependencies = [
-    "flashinfer-jit-cache==0.6.8.post1",
-    "flashinfer-cubin==0.6.8.post1",
-    # fastapi 0.137.0 refactored include_router() to store `_IncludedRouter` wrapper objects in
-    # `app.routes`, which prometheus-fastapi-instrumentator (pulled in transitively by vLLM) cannot
-    # handle: `_get_route_name` accesses `route.path` and raises
-    # `AttributeError: '_IncludedRouter' object has no attribute 'path'`, so the vLLM server's /health
-    # endpoint 500s and the server never becomes healthy. Cap below 0.137 until the instrumentator is
-    # fixed. See https://github.com/trallnag/prometheus-fastapi-instrumentator/issues/370 and
-    # https://github.com/vllm-project/vllm/issues/45596
-    "fastapi<0.137",
-]
 # each backend should have separate dependencies that can potentially clash
 # megatron also clashes with the jax dependency from gpu and tpu extras
 conflicts = [
@@ -247,7 +234,12 @@ override-dependencies = [
     "transformer-engine-cu13; sys_platform == 'never'",
     # `nixl` hard-depends on both nixl-cu12 and nixl-cu13; drop the cu13 variant
     # so it doesn't pull the CUDA-13 stack and bump torch off the cu12 pin.
-    "nixl-cu13; sys_platform == 'never'"
+    "nixl-cu13; sys_platform == 'never'",
+    # Megatron-Bridge pins flashinfer-python==0.6.8.post1, which conflicts with
+    # our exact 0.6.12 pin (the version vLLM 0.23.0 requires). Override it to our version.
+    "flashinfer-python==0.6.12; sys_platform == 'linux' and platform_machine == 'x86_64'",
+    "flashinfer-jit-cache==0.6.12; sys_platform == 'linux' and platform_machine == 'x86_64'",
+    "flashinfer-cubin==0.6.12; sys_platform == 'linux' and platform_machine == 'x86_64'",
 ]
 
 [tool.uv.extra-build-dependencies]
@@ -289,14 +281,14 @@ explicit = true
 
 [[tool.uv.index]]
 name = "vllm-cu129"
-url = "https://wheels.vllm.ai/0.20.2/cu129"
+url = "https://wheels.vllm.ai/0.23.0/cu129"
 explicit = true
 
 [tool.uv.sources]
 skyrl-gym = { path = "./skyrl-gym", editable = true }
 # Match torch's CUDA variant (cu128).
 flashinfer-jit-cache = { index = "flashinfer-cu128", marker = "sys_platform == 'linux'" }
-# vllm 0.20.2's PyPI wheel needs CUDA 13 (libcudart.so.13); the cu129 wheel
+# vllm 0.23.0's PyPI wheel needs CUDA 13 (libcudart.so.13); the cu129 wheel
 # links libcudart.so.12, which torch+cu128 supplies.
 vllm = [
     { index = "vllm-cu129", marker = "sys_platform == 'linux'" },
 
@@ -378,13 +378,13 @@ async def start_weight_update(self, is_checkpoint_format: bool = True):
         engine = self._get_engine()
         return await asyncio.to_thread(
             engine.collective_rpc,
-            "start_weight_update",
+            "skyrl_start_weight_update",
             args=(is_checkpoint_format,),
         )
 
     async def finish_weight_update(self):
         engine = self._get_engine()
-        return await asyncio.to_thread(engine.collective_rpc, "finish_weight_update")
+        return await asyncio.to_thread(engine.collective_rpc, "skyrl_finish_weight_update")
 
 
 class AsyncVLLMInferenceEngine(BaseVLLMInferenceEngine):
@@ -608,13 +608,13 @@ async def _teardown_weight_receiver(self):
     async def start_weight_update(self, is_checkpoint_format: bool = True):
         engine = self._get_engine()
         return await engine.collective_rpc(
-            "start_weight_update",
+            "skyrl_start_weight_update",
             args=(is_checkpoint_format,),
         )
 
     async def finish_weight_update(self):
         engine = self._get_engine()
-        return await engine.collective_rpc("finish_weight_update")
+        return await engine.collective_rpc("skyrl_finish_weight_update")
 
     # ----------------------------------------
     # Methods for handling OpenAI API requests
 
@@ -6,6 +6,8 @@
 layerwise reload once per weight sync rather than once per chunk.
 """
 
+import inspect
+from collections.abc import Callable
 from typing import TYPE_CHECKING
 
 import torch
@@ -14,40 +16,61 @@
     from vllm.config import ModelConfig, VllmConfig
     from vllm.v1.worker.gpu_model_runner import GPUModelRunner
 
-# Workaround for a vLLM layerwise-reload corruption affecting NemotronH/Mamba.
-# MambaMixer2 registers `conv_weights` as a non-persistent buffer that is a
-# view of `self.conv1d.weight.data` (shared storage). vLLM's reload code path
-# (model_executor/model_loader/reload/layerwise.py) materializes the buffer
-# into a fresh uninitialized GPU tensor and then runs
-# `kernel_conv_weights.data.copy_(fresh)` in `_copy_and_restore_kernel_tensors`.
-# Because the kernel buffer shares storage with `conv1d.weight.data`, this
-# writes garbage (NaN-bit-pattern bytes in bf16) into the conv1d weight,
-# corrupting all 23 Mamba layers after every weight sync.
-#
-# Adding "conv_weights" to vLLM's SKIP_TENSORS makes capture/restore/materialize
-# skip the buffer entirely, so the view stays intact and conv1d.weight is
-# preserved. Must be applied before `record_metadata_for_reloading` runs at
-# model construction; this module is imported by vLLM via
-# --worker-extension-cls before model init, so the import-time patch is
-# correctly ordered.
-# Remove this pending https://github.com/vllm-project/vllm/pull/42481 which should
-# be included in vLLM 0.21.0
-try:
-    # Guarded import: vllm is a Linux-only optional dependency, so this module stays importable on macOS / CI.
-    from vllm.model_executor.model_loader.reload.meta import (
-        SKIP_TENSORS as _VLLM_SKIP_TENSORS,
-    )
-
-    _VLLM_SKIP_TENSORS.add("conv_weights")
-except ImportError:
-    pass
+
+def get_numel_loaded(weight_loader: Callable, args: inspect.BoundArguments) -> tuple[int, object]:
+    """
+    Determine how many elements would be loaded by a weight loader call.
+
+    Args:
+        weight_loader: used to load weights
+        args: bound arguments to weight loader
+
+    Returns:
+        number of elements loaded by the weight loader, the return value of the
+        weight loader
+    """
+    # Lazy import: vllm is a Linux-only optional dependency, so this module stays importable on macOS / CI.
+    from vllm.model_executor.model_loader.reload.meta import CopyCounter
+
+    with CopyCounter() as counter:
+        return_value = weight_loader(*args.args, **args.kwargs)
+
+    # A weight loader fills a single destination parameter, so the number of
+    # loaded elements is at most that parameter's size. Some loaders copy into
+    # the parameter more than once -- e.g. ``composed_weight_loader`` runs an
+    # in-place post-load transform (``param.copy_(fn(param))``) on top of the
+    # initial copy -- which would make CopyCounter report twice the parameter
+    # size. Over-counting inflates the layer's loaded-element total and can
+    # finalize the layer before every parameter is loaded, silently dropping
+    # the trailing parameter(s) (e.g. Mamba ``mixer.D``). Cap the count at the
+    # destination size to keep the per-layer accounting correct.
+    numel = counter.copied_numel
+    param = args.arguments.get("param", None)
+    if isinstance(param, torch.Tensor):
+        numel = min(numel, param.numel())
+    return numel, return_value
+
+
+def patch_numel_loaded():
+    # vLLM's layerwise reload binds get_numel_loaded at import time
+    # (`from .meta import get_numel_loaded`), so its call site in
+    # layerwise.py uses the `layerwise` module's own binding. Rebind that
+    # attribute to our patched version, and also replace the original in
+    # `meta` so both module attributes refer to the patched function.
+    from vllm.model_executor.model_loader.reload import layerwise as _layerwise
+    from vllm.model_executor.model_loader.reload import meta as _meta
+
+    _layerwise.get_numel_loaded = get_numel_loaded
+    _meta.get_numel_loaded = get_numel_loaded
+
+
+_PATCHED_LAYERWISE_NUMEL_LOADED = False
 
 
 class LayerwiseReloadWorkerMixin:
     """Bracket a multi-chunk weight sync with one vLLM layerwise-reload init/finalize.
 
-    `start_weight_update` initializes the layerwise reload once; each chunk then loads
-    its weights raw; `finish_weight_update` finalizes once over the whole weight set.
+    `skyrl_start_weight_update` initializes the layerwise reload once; each chunk then loads
+    its weights raw; `skyrl_finish_weight_update` finalizes once over the whole weight set.
     A per-chunk `reload_weights` is the wrong approach: it re-finalizes on every call
     and restores layers absent from that chunk, corrupting a multi-chunk sync.
     """
@@ -57,7 +80,14 @@ class LayerwiseReloadWorkerMixin:
     model_config: "ModelConfig"
     device: torch.device
 
-    def start_weight_update(self, is_checkpoint_format: bool = True) -> None:
+    # NOTE: named with a `skyrl_` prefix to avoid colliding with vLLM's own
+    # Worker.start_weight_update / finish_weight_update (added in vllm-project/vllm
+    # #39212, merge e3b65a5, shipped in vLLM 0.22.0+). vLLM injects the
+    # worker-extension class as a *base* of Worker and asserts the extension
+    # defines no attribute already present on Worker, so same-named methods abort
+    # engine init. The skyrl_-prefixed variants keep SkyRL's IPC weight-sync path
+    # (and the MoE set_current_vllm_config wrapping) intact alongside vLLM's native API.
+    def skyrl_start_weight_update(self, is_checkpoint_format: bool = True) -> None:
         """
         Prepare the model for a new weight update.
 
@@ -74,10 +104,18 @@ def start_weight_update(self, is_checkpoint_format: bool = True) -> None:
         """
         if getattr(self, "_skyrl_weight_update_active", False):
             raise RuntimeError(
-                "start_weight_update called while a weight update is "
-                "already active. Call finish_weight_update first."
+                "skyrl_start_weight_update called while a weight update is "
+                "already active. Call skyrl_finish_weight_update first."
             )
 
+        # Ensure the get_numel_loaded patch is in effect before layerwise
+        # reload runs.
+        global _PATCHED_LAYERWISE_NUMEL_LOADED
+        if not _PATCHED_LAYERWISE_NUMEL_LOADED:
+            # use patched version, based on https://github.com/vllm-project/vllm/pull/44814
+            patch_numel_loaded()
+            _PATCHED_LAYERWISE_NUMEL_LOADED = True
+
         if is_checkpoint_format:
             # Lazy import: vllm is a Linux-only optional dependency, so this module stays importable on macOS / CI.
             from vllm.config import set_current_vllm_config
@@ -92,7 +130,7 @@ def start_weight_update(self, is_checkpoint_format: bool = True) -> None:
         self._skyrl_is_checkpoint_format = is_checkpoint_format
         self._skyrl_weight_update_active = True
 
-    def finish_weight_update(self) -> None:
+    def skyrl_finish_weight_update(self) -> None:
         """
         Finalize the current weight update.
 
@@ -101,7 +139,7 @@ def finish_weight_update(self) -> None:
         Must be called after all update_weights_ipc calls are done.
         """
         if not getattr(self, "_skyrl_weight_update_active", False):
-            raise RuntimeError("start_weight_update must be called before finish_weight_update.")
+            raise RuntimeError("skyrl_start_weight_update must be called before skyrl_finish_weight_update.")
 
         if self._skyrl_is_checkpoint_format:
             # Lazy import: vllm is a Linux-only optional dependency, so this module stays importable on macOS / CI.
 
@@ -5,7 +5,7 @@
 enables chunked weight updates from training to inference using the
 start/update/finish lifecycle:
 
-    start_weight_update   ->  one or more update_weights_ipc  ->  finish_weight_update
+    skyrl_start_weight_update   ->  one or more update_weights_ipc  ->  skyrl_finish_weight_update
 
 This separates the layerwise reload initialization/finalization from individual
 chunk transfers, allowing weights to be sent in bounded-memory chunks rather
@@ -40,9 +40,9 @@ class NewInferenceWorkerWrap(LayerwiseReloadWorkerMixin):
     vLLM worker extension for chunked weight sync (new inference path).
 
     Provides a three-phase weight update protocol via collective_rpc:
-        1. start_weight_update: Prepare model for receiving weights
+        1. skyrl_start_weight_update: Prepare model for receiving weights
         2. update_weights_ipc: Receive and load one chunk of weights
-        3. finish_weight_update: Finalize the model after all chunks
+        3. skyrl_finish_weight_update: Finalize the model after all chunks
 
     Attributes accessed from the host GPUWorker (via mixin inheritance):
         self.weight_transfer_engine
@@ -70,7 +70,7 @@ def update_weights_ipc(self, update_info: dict) -> None:
                 - ipc_handles_pickled: b64(pickle({gpu_uuid: (func, args)}))
         """
         if not getattr(self, "_skyrl_weight_update_active", False):
-            raise RuntimeError("start_weight_update must be called before update_weights_ipc.")
+            raise RuntimeError("skyrl_start_weight_update must be called before update_weights_ipc.")
 
         if self.weight_transfer_engine is None:
             raise RuntimeError(
@@ -141,7 +141,7 @@ def update_weights_nccl(self, update_info: dict) -> None:
         https://github.com/vllm-project/vllm/pull/42577
         """
         if not getattr(self, "_skyrl_weight_update_active", False):
-            raise RuntimeError("start_weight_update must be called before update_weights_nccl.")
+            raise RuntimeError("skyrl_start_weight_update must be called before update_weights_nccl.")
 
         if self.weight_transfer_engine is None:
             raise RuntimeError(
 
@@ -1118,7 +1118,7 @@ async def start_weight_update(
         """
         Start a new chunked weight update via /collective_rpc.
 
-        Calls the NewInferenceWorkerWrap.start_weight_update method on all
+        Calls the NewInferenceWorkerWrap.skyrl_start_weight_update method on all
         workers. For checkpoint-format weights this initializes layerwise
         reload. Must be called before any update_weights_ipc calls.
 
@@ -1132,7 +1132,7 @@ async def start_weight_update(
         return await self._call_all_servers(
             "/collective_rpc",
             {
-                "method": "start_weight_update",
+                "method": "skyrl_start_weight_update",
                 "kwargs": {"is_checkpoint_format": is_checkpoint_format},
             },
         )
@@ -1145,8 +1145,8 @@ async def update_weights_ipc(
         Send a single weight chunk via /collective_rpc.
 
         Calls NewInferenceWorkerWrap.update_weights_ipc on all workers.
-        Can be called multiple times between start_weight_update and
-        finish_weight_update.
+        Can be called multiple times between skyrl_start_weight_update and
+        skyrl_finish_weight_update.
 
         Args:
             update_info: Dict with backend-specific update info (names,
@@ -1196,15 +1196,15 @@ async def finish_weight_update(self) -> Dict[str, Any]:
         """
         Finish the current chunked weight update via /collective_rpc.
 
-        Calls NewInferenceWorkerWrap.finish_weight_update on all workers.
+        Calls NewInferenceWorkerWrap.skyrl_finish_weight_update on all workers.
         For checkpoint-format weights, runs layerwise postprocessing.
 
         Returns:
             Dict mapping server_url to response.
         """
         return await self._call_all_servers(
             "/collective_rpc",
-            {"method": "finish_weight_update"},
+            {"method": "skyrl_finish_weight_update"},
         )
 
     async def load_lora_adapter(