[fix] Fix silent MoE corruption in the legacy multi-chunk weight sync (NovaSky-AI#1737)

jamesbraza · claude · web-flow · commit 5a527dfc6605 · 2026-06-14T11:33:34.000-07:00
Builds on NovaSky-AI#1685, which switched the legacy `WorkerWrap.load_weights` path to `reload_weights`. After that change, `load_weights` ran a self-contained `reload_weights` per chunk, so vLLM's per-call `finalize_layerwise_processing` restored every layer absent from that chunk — silently corrupting any multi-chunk weight sync into MoE gibberish after the first sync (NovaSky-AI#1680; upstream vllm-project/vllm#42821). This PR brackets the whole sync with a single layerwise-reload initialize/finalize via a shared `LayerwiseReloadWorkerMixin`, sharing this lifecycle with the 'new' inference path (`new_inference_worker_wrap.py`). Closes NovaSky-AI#1680. --------- Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/skyrl/backends/skyrl_train/inference_engines/inference_engine_client.py b/skyrl/backends/skyrl_train/inference_engines/inference_engine_client.py
@@ -354,6 +354,12 @@ async def init_weight_update_communicator(self, init_info: "WeightSyncInitInfo")
     async def update_named_weights(self, request: WeightUpdateRequest):
         return await self._run_on_all_engines("update_named_weights", request=request)
 
+    async def start_weight_update(self, is_checkpoint_format: bool = True):
+        return await self._run_on_all_engines("start_weight_update", is_checkpoint_format=is_checkpoint_format)
+
+    async def finish_weight_update(self):
+        return await self._run_on_all_engines("finish_weight_update")
+
     async def reset_prefix_cache(self):
         return await self._run_on_all_engines("reset_prefix_cache")
 
diff --git a/skyrl/backends/skyrl_train/inference_engines/ray_wrapped_inference_engine.py b/skyrl/backends/skyrl_train/inference_engines/ray_wrapped_inference_engine.py
@@ -69,6 +69,12 @@ async def init_weight_update_communicator(self, init_info: "WeightSyncInitInfo")
     async def update_named_weights(self, request: WeightUpdateRequest):
         return await self.inference_engine_actor.update_named_weights.remote(request)
 
+    async def start_weight_update(self, is_checkpoint_format: bool = True):
+        return await self.inference_engine_actor.start_weight_update.remote(is_checkpoint_format=is_checkpoint_format)
+
+    async def finish_weight_update(self):
+        return await self.inference_engine_actor.finish_weight_update.remote()
+
     async def teardown(self):
         return await self.inference_engine_actor.teardown.remote()
 
diff --git a/skyrl/backends/skyrl_train/inference_engines/vllm/vllm_engine.py b/skyrl/backends/skyrl_train/inference_engines/vllm/vllm_engine.py
@@ -372,6 +372,18 @@ async def _teardown_weight_receiver(self):
         engine = self._get_engine()
         return await asyncio.to_thread(engine.collective_rpc, "teardown_weight_receiver")
 
+    async def start_weight_update(self, is_checkpoint_format: bool = True):
+        engine = self._get_engine()
+        return await asyncio.to_thread(
+            engine.collective_rpc,
+            "start_weight_update",
+            args=(is_checkpoint_format,),
+        )
+
+    async def finish_weight_update(self):
+        engine = self._get_engine()
+        return await asyncio.to_thread(engine.collective_rpc, "finish_weight_update")
+
 
 class AsyncVLLMInferenceEngine(BaseVLLMInferenceEngine):
     """Asynchronous VLLM engine."""
@@ -591,6 +603,17 @@ async def _teardown_weight_receiver(self):
         engine = self._get_engine()
         return await engine.collective_rpc("teardown_weight_receiver")
 
+    async def start_weight_update(self, is_checkpoint_format: bool = True):
+        engine = self._get_engine()
+        return await engine.collective_rpc(
+            "start_weight_update",
+            args=(is_checkpoint_format,),
+        )
+
+    async def finish_weight_update(self):
+        engine = self._get_engine()
+        return await engine.collective_rpc("finish_weight_update")
+
     # ----------------------------------------
     # Methods for handling OpenAI API requests
     # ----------------------------------------
diff --git a/skyrl/backends/skyrl_train/inference_servers/layerwise_reload.py b/skyrl/backends/skyrl_train/inference_servers/layerwise_reload.py
@@ -0,0 +1,118 @@
+"""Shared vLLM layerwise-reload lifecycle for SkyRL's vLLM worker-extension classes.
+
+Provides `LayerwiseReloadWorkerMixin`, the start/finish bracket that both
+`vllm_worker.WorkerWrap` and
+`new_inference_worker_wrap.NewInferenceWorkerWrap` use to run vLLM's
+layerwise reload once per weight sync rather than once per chunk.
+"""
+
+from typing import TYPE_CHECKING
+
+import torch
+
+if TYPE_CHECKING:
+    from vllm.config import ModelConfig, VllmConfig
+    from vllm.v1.worker.gpu_model_runner import GPUModelRunner
+
+# Workaround for a vLLM layerwise-reload corruption affecting NemotronH/Mamba.
+# MambaMixer2 registers `conv_weights` as a non-persistent buffer that is a
+# view of `self.conv1d.weight.data` (shared storage). vLLM's reload code path
+# (model_executor/model_loader/reload/layerwise.py) materializes the buffer
+# into a fresh uninitialized GPU tensor and then runs
+# `kernel_conv_weights.data.copy_(fresh)` in `_copy_and_restore_kernel_tensors`.
+# Because the kernel buffer shares storage with `conv1d.weight.data`, this
+# writes garbage (NaN-bit-pattern bytes in bf16) into the conv1d weight,
+# corrupting all 23 Mamba layers after every weight sync.
+#
+# Adding "conv_weights" to vLLM's SKIP_TENSORS makes capture/restore/materialize
+# skip the buffer entirely, so the view stays intact and conv1d.weight is
+# preserved. Must be applied before `record_metadata_for_reloading` runs at
+# model construction; this module is imported by vLLM via
+# --worker-extension-cls before model init, so the import-time patch is
+# correctly ordered.
+# Remove this workaround once https://github.com/vllm-project/vllm/pull/42481 lands;
+# that fix is expected to be included in vLLM 0.21.0.
+try:
+    # Guarded import: vllm is a Linux-only optional dependency, so this module stays importable on macOS / CI.
+    from vllm.model_executor.model_loader.reload.meta import (
+        SKIP_TENSORS as _VLLM_SKIP_TENSORS,
+    )
+
+    _VLLM_SKIP_TENSORS.add("conv_weights")
+except ImportError:
+    pass
+
+
+class LayerwiseReloadWorkerMixin:
+    """Bracket a multi-chunk weight sync with one vLLM layerwise-reload init/finalize.
+
+    `start_weight_update` initializes the layerwise reload once; each chunk then loads
+    its weights raw; `finish_weight_update` finalizes once over the whole weight set.
+    A per-chunk `reload_weights` is the wrong approach: it re-finalizes on every call
+    and restores layers absent from that chunk, corrupting a multi-chunk sync.
+    """
+
+    vllm_config: "VllmConfig"
+    model_runner: "GPUModelRunner"
+    model_config: "ModelConfig"
+    device: torch.device
+
+    def start_weight_update(self, is_checkpoint_format: bool = True) -> None:
+        """
+        Prepare the model for a new weight update.
+
+        For checkpoint-format weights, initializes the layerwise reload
+        machinery which moves layers to meta device and wraps weight loaders
+        to defer processing until all weights for each layer are loaded.
+
+        Must be called before any chunk is loaded (e.g. `update_weights_ipc` or `load_weights`).
+
+        Args:
+            is_checkpoint_format: True if incoming weights are in checkpoint
+                format (need layerwise processing). False if weights are
+                already in kernel format (direct copy).
+        """
+        if getattr(self, "_skyrl_weight_update_active", False):
+            raise RuntimeError(
+                "start_weight_update called while a weight update is "
+                "already active. Call finish_weight_update first."
+            )
+
+        if is_checkpoint_format:
+            # Lazy import: vllm is a Linux-only optional dependency, so this module stays importable on macOS / CI.
+            from vllm.config import set_current_vllm_config
+            from vllm.model_executor.model_loader.reload import (
+                initialize_layerwise_reload,
+            )
+
+            model = self.model_runner.model
+            with set_current_vllm_config(self.vllm_config), torch.device(self.device):
+                initialize_layerwise_reload(model)
+
+        self._skyrl_is_checkpoint_format = is_checkpoint_format
+        self._skyrl_weight_update_active = True
+
+    def finish_weight_update(self) -> None:
+        """
+        Finalize the current weight update.
+
+        For checkpoint-format weights, runs layerwise postprocessing
+        (quantization repacking, attention weight processing, etc.).
+        Must be called after all chunks are loaded (e.g. `update_weights_ipc` or `load_weights`).
+        """
+        if not getattr(self, "_skyrl_weight_update_active", False):
+            raise RuntimeError("start_weight_update must be called before finish_weight_update.")
+
+        if self._skyrl_is_checkpoint_format:
+            # Lazy import: vllm is a Linux-only optional dependency, so this module stays importable on macOS / CI.
+            from vllm.config import set_current_vllm_config
+            from vllm.model_executor.model_loader.reload import (
+                finalize_layerwise_reload,
+            )
+
+            model = self.model_runner.model
+            with set_current_vllm_config(self.vllm_config), torch.device(self.device):
+                finalize_layerwise_reload(model, self.model_config)
+
+        self._skyrl_weight_update_active = False
+        self._skyrl_is_checkpoint_format = True
diff --git a/skyrl/backends/skyrl_train/inference_servers/new_inference_worker_wrap.py b/skyrl/backends/skyrl_train/inference_servers/new_inference_worker_wrap.py
@@ -28,37 +28,14 @@
 
 import torch
 
-# Workaround for a vLLM layerwise-reload corruption affecting NemotronH/Mamba.
-# MambaMixer2 registers `conv_weights` as a non-persistent buffer that is a
-# view of `self.conv1d.weight.data` (shared storage). vLLM's reload code path
-# (model_executor/model_loader/reload/layerwise.py) materializes the buffer
-# into a fresh uninitialized GPU tensor and then runs
-# `kernel_conv_weights.data.copy_(fresh)` in `_copy_and_restore_kernel_tensors`.
-# Because the kernel buffer shares storage with `conv1d.weight.data`, this
-# writes garbage (NaN-bit-pattern bytes in bf16) into the conv1d weight,
-# corrupting all 23 Mamba layers after every weight sync.
-#
-# Adding "conv_weights" to vLLM's SKIP_TENSORS makes capture/restore/materialize
-# skip the buffer entirely, so the view stays intact and conv1d.weight is
-# preserved. Must be applied before `record_metadata_for_reloading` runs at
-# model construction; this module is imported by vLLM via
-# --worker-extension-cls before model init, so the import-time patch is
-# correctly ordered.
-# Remove this pending https://github.com/vllm-project/vllm/pull/42481 which should
-# be included in vLLM 0.21.0
-try:
-    from vllm.model_executor.model_loader.reload.meta import (
-        SKIP_TENSORS as _VLLM_SKIP_TENSORS,
-    )
-
-    _VLLM_SKIP_TENSORS.add("conv_weights")
-except ImportError:
-    pass
+from skyrl.backends.skyrl_train.inference_servers.layerwise_reload import (
+    LayerwiseReloadWorkerMixin,
+)
 
 VLLM_NEW_INFERENCE_WORKER_EXTENSION_CLS = f"{__name__}.NewInferenceWorkerWrap"
 
 
-class NewInferenceWorkerWrap:
+class NewInferenceWorkerWrap(LayerwiseReloadWorkerMixin):
     """
     vLLM worker extension for chunked weight sync (new inference path).
 
@@ -74,40 +51,6 @@ class NewInferenceWorkerWrap:
         self.device
     """
 
-    def start_weight_update(self, is_checkpoint_format: bool = True) -> None:
-        """
-        Prepare the model for a new weight update.
-
-        For checkpoint-format weights, initializes the layerwise reload
-        machinery which moves layers to meta device and wraps weight loaders
-        to defer processing until all weights for each layer are loaded.
-
-        Must be called before any update_weights_ipc calls.
-
-        Args:
-            is_checkpoint_format: True if incoming weights are in checkpoint
-                format (need layerwise processing). False if weights are
-                already in kernel format (direct copy).
-        """
-        if getattr(self, "_skyrl_weight_update_active", False):
-            raise RuntimeError(
-                "start_weight_update called while a weight update is "
-                "already active. Call finish_weight_update first."
-            )
-
-        if is_checkpoint_format:
-            from vllm.config import set_current_vllm_config
-            from vllm.model_executor.model_loader.reload import (
-                initialize_layerwise_reload,
-            )
-
-            model = self.model_runner.model
-            with set_current_vllm_config(self.vllm_config), torch.device(self.device):
-                initialize_layerwise_reload(model)
-
-        self._skyrl_is_checkpoint_format = is_checkpoint_format
-        self._skyrl_weight_update_active = True
-
     def update_weights_ipc(self, update_info: dict) -> None:
         """
         Receive and load a single chunk of weights.
@@ -217,27 +160,3 @@ def update_weights_nccl(self, update_info: dict) -> None:
             )
 
         torch.accelerator.synchronize()
-
-    def finish_weight_update(self) -> None:
-        """
-        Finalize the current weight update.
-
-        For checkpoint-format weights, runs layerwise postprocessing
-        (quantization repacking, attention weight processing, etc.).
-        Must be called after all update_weights_ipc calls are done.
-        """
-        if not getattr(self, "_skyrl_weight_update_active", False):
-            raise RuntimeError("start_weight_update must be called before finish_weight_update.")
-
-        if self._skyrl_is_checkpoint_format:
-            from vllm.config import set_current_vllm_config
-            from vllm.model_executor.model_loader.reload import (
-                finalize_layerwise_reload,
-            )
-
-            model = self.model_runner.model
-            with set_current_vllm_config(self.vllm_config), torch.device(self.device):
-                finalize_layerwise_reload(model, self.model_config)
-
-        self._skyrl_weight_update_active = False
-        self._skyrl_is_checkpoint_format = True
diff --git a/skyrl/backends/skyrl_train/inference_servers/vllm_worker.py b/skyrl/backends/skyrl_train/inference_servers/vllm_worker.py
@@ -18,11 +18,15 @@
 
 import torch
 
+from skyrl.backends.skyrl_train.inference_servers.layerwise_reload import (
+    LayerwiseReloadWorkerMixin,
+)
+
 # Path to this worker extension class for use in CLI args (derived from module path)
 VLLM_WORKER_EXTENSION_CLS = f"{__name__}.WorkerWrap"
 
 
-class WorkerWrap:
+class WorkerWrap(LayerwiseReloadWorkerMixin):
     """
     vLLM worker extension for SkyRL weight synchronization.
 
@@ -32,7 +36,9 @@ class WorkerWrap:
 
     Methods:
         init_weight_update_communicator: Initialize the weight receiver
-        load_weights: Receive and load weights from trainer
+        start_weight_update: Begin a sync; initialize vLLM layerwise reload once
+        load_weights: Receive and load one chunk of weights from trainer
+        finish_weight_update: End a sync; finalize vLLM layerwise reload once
         teardown_weight_receiver: Clean up weight receiver resources
     """
 
@@ -73,9 +79,15 @@ def init_weight_update_communicator(self, init_info: bytes):
 
     def load_weights(self, request: bytes) -> None:
         """
-        Load weights using the receiver.
+        Load one chunk of weights using the receiver.
 
-        This method is called via collective_rpc from the weight loader.
+        Called via collective_rpc from the weight loader, once per chunk.
+        When the sender brackets the sync with start_weight_update / finish_weight_update,
+        the chunk is loaded raw and the single finalize runs vLLM's post-load weight
+        processing exactly once over the whole weight set.
+        Without a bracket, it falls back to a self-contained reload_weights
+        (initialize + load + finalize in this one call), correct when the call
+        carries the whole model so finalize sees every layer and restores none.
 
         Args:
             request: Pickled bytes of WeightUpdateRequest.
@@ -92,8 +104,17 @@ def load_weights(self, request: bytes) -> None:
         for name, tensor in self._weight_receiver.receive_weights(request):
             weight_list.append((name, tensor))
 
+        weight_update_bracketed = getattr(self, "_skyrl_weight_update_active", False)
         with torch.device(self.device), set_current_vllm_config(self.vllm_config):
-            self.model_runner.reload_weights(weights_iterator=iter(weight_list))
+            if weight_update_bracketed:
+                self.model_runner.model.load_weights(weights=weight_list)
+            else:
+                self.model_runner.reload_weights(weights_iterator=iter(weight_list))
+
+        if weight_update_bracketed:
+            # Finish consuming IPC-backed tensors before the sender drops them on
+            # its next barrier; matches NewInferenceWorkerWrap.update_weights_ipc
+            torch.accelerator.synchronize()
 
         for weight in weight_list:
             del weight
diff --git a/skyrl/backends/skyrl_train/weight_sync/broadcast_strategy.py b/skyrl/backends/skyrl_train/weight_sync/broadcast_strategy.py
@@ -230,6 +230,12 @@ async def _send_chunks_legacy(self, chunks: Iterable[WeightChunk]) -> None:
         if rank == 0:
             assert self._model_update_group is not None, "Rank 0 must have model_update_group"
 
+        # Bracket the whole sync with one layerwise-reload initialize/finalize so that no
+        # per-chunk finalize restores layers absent from that chunk; see `vllm_worker.WorkerWrap.load_weights`
+        if rank == 0:
+            await self._inference_client.start_weight_update(is_checkpoint_format=True)
+        torch.distributed.barrier()
+
         # All ranks iterate through chunks (weight extraction may involve collective ops)
         for chunk in chunks:
             # Only rank 0 sends request to inference engines
@@ -264,6 +270,10 @@ def broadcast_packed(t, group):
 
             torch.distributed.barrier()
 
+        if rank == 0:
+            await self._inference_client.finish_weight_update()
+        torch.distributed.barrier()
+
     def teardown(self) -> None:
         """Destroy the process group used for weight transfer."""
         if self._model_update_group is not None and isinstance(
diff --git a/skyrl/backends/skyrl_train/weight_sync/cuda_ipc_strategy.py b/skyrl/backends/skyrl_train/weight_sync/cuda_ipc_strategy.py
diff --git a/tests/backends/skyrl_train/gpu/gpu_ci/inference_servers/test_weight_sync.py b/tests/backends/skyrl_train/gpu/gpu_ci/inference_servers/test_weight_sync.py