configs/patches/vllm_cumem_expandable_segments_fix.py (169 additions, 0 deletions)
@@ -0,0 +1,169 @@
"""
Patch vLLM's CuMemAllocator to be compatible with PyTorch expandable
segments by temporarily toggling the allocator setting around the memory
pool context (sleep mode), instead of hard-asserting at __init__ time.

Backports vllm-project/vllm#40812 ("Auto-disable expandable_segments
around cumem memory pool"). Without this patch, setting
PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True together with
enable-sleep-mode causes vLLM to abort during CuMemAllocator
construction; with this patch, expandable segments stay on for normal
allocations and are flipped off only for the duration of
use_memory_pool().

Reference: https://github.com/vllm-project/vllm/pull/40812
Affected file: vllm/device_allocator/cumem.py
"""

import sys
from pathlib import Path

TARGET = Path("/usr/local/lib/python3.12/dist-packages/vllm/device_allocator/cumem.py")

# Idempotency: the new use_memory_pool body introduces this exact line.
MARKER = 'expandable_was_enabled = "expandable_segments:True" in conf'

# --- Hunk 1: drop the __init__ assertion -------------------------------------

INIT_OLD = (
" def __init__(self):\n"
' conf = os.environ.get("PYTORCH_CUDA_ALLOC_CONF", "")\n'
' assert "expandable_segments:True" not in conf, (\n'
' "Expandable segments are not compatible with memory pool. "\n'
' "Please track https://github.com/pytorch/pytorch/issues/147851 "\n'
' "for the latest updates."\n'
" )\n"
"\n"
" self.pointer_to_data: dict[int, AllocationData] = {}\n"
)

INIT_NEW = (
" def __init__(self):\n"
" self.pointer_to_data: dict[int, AllocationData] = {}\n"
)

# --- Hunk 2: wrap use_memory_pool body in try/finally + toggle ---------------

POOL_OLD = (
" assert isinstance(tag, str)\n"
"\n"
" old_tag = self.current_tag\n"
" self.current_tag = tag\n"
" with use_memory_pool_with_allocator(\n"
" self.python_malloc_callback, self.python_free_callback\n"
" ) as data:\n"
" # start to hit another PyTorch bug in PyTorch 2.6,\n"
" # possibly because of gc-related issue w.r.t. the allocator and\n"
" # the memory pool.\n"
" # to avoid the issue, we keep a reference of the data.\n"
" # see https://github.com/pytorch/pytorch/issues/146431 .\n"
" self.allocator_and_pools[tag] = data\n"
" yield\n"
" # PyTorch's bug, calling torch.cuda.empty_cache() will error\n"
" # when using pluggable allocator, see\n"
" # https://github.com/pytorch/pytorch/issues/145168 .\n"
" # if we have some memory allocated and then freed,\n"
" # the memory will not be released, e.g. in online quantization,\n"
" # where the model is created in higher precision, and then\n"
" # quantized in lower precision.\n"
" # Find all unused allocations and manually release them.\n"
" # TODO: we should expose `empty_cache` method in the memory pool.\n"
" # TODO: ask for help from PyTorch team to expose this method.\n"
" allocations = data[0].snapshot()\n"
" for allocation in allocations:\n"
" if allocation[\"allocated_size\"] == 0:\n"
" handle = self._python_free_callback(allocation[\"address\"])\n"
" unmap_and_release(handle)\n"
" self.current_tag = old_tag\n"
)

POOL_NEW = (
" assert isinstance(tag, str)\n"
"\n"
" # Expandable segments are incompatible with the memory pool used for\n"
" # sleep mode (see https://github.com/pytorch/pytorch/issues/147851).\n"
" # If the user has enabled expandable segments via\n"
" # PYTORCH_CUDA_ALLOC_CONF, temporarily disable them for the duration\n"
" # of the memory pool context and restore on exit.\n"
' conf = os.environ.get("PYTORCH_CUDA_ALLOC_CONF", "")\n'
' expandable_was_enabled = "expandable_segments:True" in conf\n'
" if expandable_was_enabled:\n"
' torch.cuda.memory._set_allocator_settings("expandable_segments:False")\n'
"\n"
" old_tag = self.current_tag\n"
" self.current_tag = tag\n"
" try:\n"
" with use_memory_pool_with_allocator(\n"
" self.python_malloc_callback, self.python_free_callback\n"
" ) as data:\n"
" # start to hit another PyTorch bug in PyTorch 2.6,\n"
" # possibly because of gc-related issue w.r.t. the allocator\n"
" # and the memory pool.\n"
" # to avoid the issue, we keep a reference of the data.\n"
" # see https://github.com/pytorch/pytorch/issues/146431 .\n"
" self.allocator_and_pools[tag] = data\n"
" yield\n"
" # PyTorch's bug, calling torch.cuda.empty_cache() will error\n"
" # when using pluggable allocator, see\n"
" # https://github.com/pytorch/pytorch/issues/145168 .\n"
" # if we have some memory allocated and then freed,\n"
" # the memory will not be released, e.g. in online\n"
" # quantization, where the model is created in higher\n"
" # precision, and then quantized in lower precision.\n"
" # Find all unused allocations and manually release them.\n"
" # TODO: we should expose `empty_cache` method in the memory\n"
" # pool.\n"
" # TODO: ask for help from PyTorch team to expose this method.\n"
" allocations = data[0].snapshot()\n"
" for allocation in allocations:\n"
" if allocation[\"allocated_size\"] == 0:\n"
" handle = self._python_free_callback(allocation[\"address\"])\n"
" unmap_and_release(handle)\n"
" finally:\n"
" self.current_tag = old_tag\n"
" if expandable_was_enabled:\n"
' torch.cuda.memory._set_allocator_settings("expandable_segments:True")\n'
)

PATCHES = [
("CuMemAllocator.__init__ assertion removal", INIT_OLD, INIT_NEW),
("CuMemAllocator.use_memory_pool toggle", POOL_OLD, POOL_NEW),
]


def main():
if not TARGET.exists():
print(f"[vllm-cumem-expandable-fix] Target not found: {TARGET}", file=sys.stderr)
sys.exit(1)

content = TARGET.read_text()
if MARKER in content:
print("[vllm-cumem-expandable-fix] Already patched, skipping.", file=sys.stderr)
return

new_content = content
for name, old, new in PATCHES:
count = new_content.count(old)
if count == 0:
print(
f"[vllm-cumem-expandable-fix] Anchor for {name!r} not found. "
"vLLM version may have drifted; inspect cumem.py.",
file=sys.stderr,
)
sys.exit(1)
if count > 1:
print(
f"[vllm-cumem-expandable-fix] Anchor for {name!r} is ambiguous "
f"({count} matches); refusing to patch.",
file=sys.stderr,
)
sys.exit(1)
new_content = new_content.replace(old, new, 1)
print(f"[vllm-cumem-expandable-fix] Patched {name}", file=sys.stderr)

TARGET.write_text(new_content)
print("[vllm-cumem-expandable-fix] Done.", file=sys.stderr)


if __name__ == "__main__":
main()
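
For context on the patch above, a minimal usage sketch of the setup it targets (the model name and sleep level are illustrative assumptions, not part of the patch): with stock vLLM, constructing the engine below aborts in CuMemAllocator.__init__; with the patch applied, expandable segments stay enabled for normal allocations and are toggled off only inside use_memory_pool().

    import os

    # Must be set before vLLM / CUDA initialization.
    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

    from vllm import LLM

    # With sleep mode enabled, weight and KV-cache allocations go through
    # CuMemAllocator's memory pool, which is where the patched toggle runs.
    llm = LLM(model="facebook/opt-125m", enable_sleep_mode=True)
    llm.sleep(level=1)
    llm.wake_up()
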
configs/patches/vllm_deepseek_v4_mega_moe_free_orig.py (111 additions, 0 deletions)
@@ -0,0 +1,111 @@
"""
Free original DeepSeek V4 MoE expert weights after MegaMoE finalize.

Symptom (seen on GB200 decode, EP=8, VLLM_DEEPSEEK_V4_USE_MEGA_MOE=1):
torch.OutOfMemoryError: Tried to allocate 1008.00 MiB.
GPU 0 has a total capacity of 184.31 GiB of which 381.44 MiB is free.
181.02 GiB allocated by PyTorch.
Stack ends in deep_gemm/mega/__init__.py interleave():
torch.empty_like(t).copy_(torch.stack([gate, up], dim=2).reshape(...))

Root cause: DeepseekV4MegaMoEExperts.finalize_weights() builds
self._transformed_l1_weights / _transformed_l2_weights but does NOT release
the original self.w13_weight / w2_weight / *_weight_scale parameters. Both
copies stay resident on the GPU through the finalize loop, and on EP=8 the
per-rank weight footprint (~125 GiB) plus this duplication leaves no
headroom for the per-layer interleave temporaries (~1 GiB peak).

The forward path (deepseek_v4.py: _run_mega_moe, ~lines 538-547) was
verified to read only self._transformed_l1_weights / _transformed_l2_weights;
the original w13_weight / w2_weight / *_weight_scale are dead after finalize.

Fix (mirrors upstream PR vllm-project/vllm#40860): at the end of each
expert module's finalize_weights(), drop the four original Parameters by
assigning them to None so the module no longer holds references to them.
transform_weights_for_mega_moe allocates fresh L1 + SF tensors; only the
L2 weight aliases the original w2_weight storage, and because
_transformed_l2_weights still holds that reference, the storage stays
live via refcount. PyTorch's caching allocator can then reuse the freed
storage for the NEXT layer's interleave temporaries within the same
finalize loop.

Reference: vllm/model_executor/models/deepseek_v4.py,
DeepseekV4MegaMoEExperts.finalize_weights().
"""

import sys
from pathlib import Path

TARGET = Path(
"/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/deepseek_v4.py"
)

# Idempotency marker
MARKER = "srt-slurm-sa hotfix: free original MegaMoE expert weights"

# Anchor: closing of the _transformed_l1/l2 assignment in finalize_weights().
# The triple-`)` pattern is unique in the file.
OLD = (
" self._transformed_l1_weights, self._transformed_l2_weights = (\n"
" deep_gemm.transform_weights_for_mega_moe(\n"
" (self.w13_weight.data.view(torch.int8).contiguous(), w13_scale),\n"
" (self.w2_weight.data.view(torch.int8).contiguous(), w2_scale),\n"
" )\n"
" )\n"
)

NEW = (
OLD
+ " # srt-slurm-sa hotfix: free original MegaMoE expert weights.\n"
+ " # Mirrors vllm-project/vllm#40860. transform_weights_for_mega_moe\n"
+ " # allocates fresh L1 + SF tensors; only the L2 weight aliases the\n"
+ " # original w2_weight storage, but _transformed_l2_weights holds that\n"
+ " # reference, so dropping the Parameters is safe via refcount and the\n"
+ " # freed storage returns to the caching allocator in time for the next\n"
+ " # layer's interleave temp (~1 GiB).\n"
+ " self.w13_weight = None\n"
+ " self.w13_weight_scale = None\n"
+ " self.w2_weight = None\n"
+ " self.w2_weight_scale = None\n"
)


def main():
if not TARGET.exists():
print(f"[vllm-mega-moe-free-orig] Target not found: {TARGET}", file=sys.stderr)
sys.exit(1)

content = TARGET.read_text()

if MARKER in content:
print("[vllm-mega-moe-free-orig] Already patched, skipping.", file=sys.stderr)
return

count = content.count(OLD)
if count == 0:
print(
"[vllm-mega-moe-free-orig] Could not find finalize_weights anchor. "
"vLLM version may have drifted; inspect "
"DeepseekV4MegaMoEExperts.finalize_weights().",
file=sys.stderr,
)
sys.exit(1)
if count > 1:
print(
f"[vllm-mega-moe-free-orig] Anchor is ambiguous ({count} occurrences); "
"refusing to patch.",
file=sys.stderr,
)
sys.exit(1)

content = content.replace(OLD, NEW)
TARGET.write_text(content)
print(
"[vllm-mega-moe-free-orig] Freed original w13/w2 weights and scales "
"in DeepseekV4MegaMoEExperts.finalize_weights().",
file=sys.stderr,
)


if __name__ == "__main__":
main()
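
The fix above leans on standard PyTorch module semantics: assigning None to a registered Parameter attribute clears the module's reference, and the CUDA storage returns to the caching allocator once nothing else points at it. A minimal standalone sketch (the toy module and tensor size are illustrative, not vLLM code; requires a CUDA device):

    import torch
    import torch.nn as nn

    class Expert(nn.Module):
        def __init__(self):
            super().__init__()
            # Stand-in for w13_weight: a large CUDA parameter.
            self.w13_weight = nn.Parameter(
                torch.empty(4096, 4096, device="cuda"), requires_grad=False
            )

    m = Expert()
    before = torch.cuda.memory_allocated()
    m.w13_weight = None  # module entry becomes None; dropped from named_parameters()
    after = torch.cuda.memory_allocated()
    assert "w13_weight" not in dict(m.named_parameters())
    assert after < before  # storage freed once the last reference is gone
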
configs/vllm-container-deps.sh (18 additions, 0 deletions)
@@ -4,6 +4,24 @@

pip install msgpack

# Upgrade flashinfer to v0.6.9 (commands currently commented out).
# flashinfer-python / flashinfer-cubin are published only on PyPI, while
# flashinfer-jit-cache is CUDA-specific and lives only on the cu130 index;
# --index-url replaces PyPI entirely, so the upgrade is split into two calls.
# pip install --upgrade flashinfer-python==0.6.9 flashinfer-cubin==0.6.9
# pip install --upgrade flashinfer-jit-cache==0.6.9 --index-url https://flashinfer.ai/whl/cu130

if [ -f /configs/patches/vllm_numa_bind_hash_fix.py ]; then
python3 /configs/patches/vllm_numa_bind_hash_fix.py
fi

if [ -f /configs/patches/vllm_nvlink_one_sided_bf16_fix.py ]; then
python3 /configs/patches/vllm_nvlink_one_sided_bf16_fix.py
fi

if [ -f /configs/patches/vllm_deepseek_v4_mega_moe_free_orig.py ]; then
python3 /configs/patches/vllm_deepseek_v4_mega_moe_free_orig.py
fi

if [ -f /configs/patches/vllm_cumem_expandable_segments_fix.py ]; then
python3 /configs/patches/vllm_cumem_expandable_segments_fix.py
fi
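
Once the script above has run inside the container, a quick way to confirm that the two new patches actually landed is to look for their idempotency markers. A small Python check along these lines (the paths and marker strings are the ones defined in the patch scripts above):

    from pathlib import Path

    # Target paths and marker strings come from the two patch scripts above.
    checks = {
        "/usr/local/lib/python3.12/dist-packages/vllm/device_allocator/cumem.py": (
            'expandable_was_enabled = "expandable_segments:True" in conf'
        ),
        "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/"
        "deepseek_v4.py": "srt-slurm-sa hotfix: free original MegaMoE expert weights",
    }
    for path, marker in checks.items():
        status = "patched" if marker in Path(path).read_text() else "NOT patched"
        print(f"{path}: {status}")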