configs/patches/vllm_cumem_expandable_segments_fix.py (169 additions, 0 deletions)
@@ -0,0 +1,169 @@
"""
Patch vLLM's CuMemAllocator to be compatible with PyTorch expandable
segments by temporarily toggling the allocator setting around the memory
pool context (sleep mode), instead of hard-asserting at __init__ time.

Backports vllm-project/vllm#40812 ("Auto-disable expandable_segments
around cumem memory pool"). Without this patch, setting
PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True together with
enable-sleep-mode causes vLLM to abort during CuMemAllocator
construction; with this patch, expandable segments stay on for normal
allocations and are flipped off only for the duration of
use_memory_pool().

Reference: https://github.com/vllm-project/vllm/pull/40812
Affected file: vllm/device_allocator/cumem.py
"""

import sys
from pathlib import Path

TARGET = Path("/usr/local/lib/python3.12/dist-packages/vllm/device_allocator/cumem.py")

# Idempotency: the new use_memory_pool body introduces this exact line.
MARKER = 'expandable_was_enabled = "expandable_segments:True" in conf'

# --- Hunk 1: drop the __init__ assertion -------------------------------------

INIT_OLD = (
" def __init__(self):\n"
' conf = os.environ.get("PYTORCH_CUDA_ALLOC_CONF", "")\n'
' assert "expandable_segments:True" not in conf, (\n'
' "Expandable segments are not compatible with memory pool. "\n'
' "Please track https://github.com/pytorch/pytorch/issues/147851 "\n'
' "for the latest updates."\n'
" )\n"
"\n"
" self.pointer_to_data: dict[int, AllocationData] = {}\n"
)

INIT_NEW = (
" def __init__(self):\n"
" self.pointer_to_data: dict[int, AllocationData] = {}\n"
)

# --- Hunk 2: wrap use_memory_pool body in try/finally + toggle ---------------

POOL_OLD = (
" assert isinstance(tag, str)\n"
"\n"
" old_tag = self.current_tag\n"
" self.current_tag = tag\n"
" with use_memory_pool_with_allocator(\n"
" self.python_malloc_callback, self.python_free_callback\n"
" ) as data:\n"
" # start to hit another PyTorch bug in PyTorch 2.6,\n"
" # possibly because of gc-related issue w.r.t. the allocator and\n"
" # the memory pool.\n"
" # to avoid the issue, we keep a reference of the data.\n"
" # see https://github.com/pytorch/pytorch/issues/146431 .\n"
" self.allocator_and_pools[tag] = data\n"
" yield\n"
" # PyTorch's bug, calling torch.cuda.empty_cache() will error\n"
" # when using pluggable allocator, see\n"
" # https://github.com/pytorch/pytorch/issues/145168 .\n"
" # if we have some memory allocated and then freed,\n"
" # the memory will not be released, e.g. in online quantization,\n"
" # where the model is created in higher precision, and then\n"
" # quantized in lower precision.\n"
" # Find all unused allocations and manually release them.\n"
" # TODO: we should expose `empty_cache` method in the memory pool.\n"
" # TODO: ask for help from PyTorch team to expose this method.\n"
" allocations = data[0].snapshot()\n"
" for allocation in allocations:\n"
" if allocation[\"allocated_size\"] == 0:\n"
" handle = self._python_free_callback(allocation[\"address\"])\n"
" unmap_and_release(handle)\n"
" self.current_tag = old_tag\n"
)

POOL_NEW = (
" assert isinstance(tag, str)\n"
"\n"
" # Expandable segments are incompatible with the memory pool used for\n"
" # sleep mode (see https://github.com/pytorch/pytorch/issues/147851).\n"
" # If the user has enabled expandable segments via\n"
" # PYTORCH_CUDA_ALLOC_CONF, temporarily disable them for the duration\n"
" # of the memory pool context and restore on exit.\n"
' conf = os.environ.get("PYTORCH_CUDA_ALLOC_CONF", "")\n'
' expandable_was_enabled = "expandable_segments:True" in conf\n'
" if expandable_was_enabled:\n"
' torch.cuda.memory._set_allocator_settings("expandable_segments:False")\n'
"\n"
" old_tag = self.current_tag\n"
" self.current_tag = tag\n"
" try:\n"
" with use_memory_pool_with_allocator(\n"
" self.python_malloc_callback, self.python_free_callback\n"
" ) as data:\n"
" # start to hit another PyTorch bug in PyTorch 2.6,\n"
" # possibly because of gc-related issue w.r.t. the allocator\n"
" # and the memory pool.\n"
" # to avoid the issue, we keep a reference of the data.\n"
" # see https://github.com/pytorch/pytorch/issues/146431 .\n"
" self.allocator_and_pools[tag] = data\n"
" yield\n"
" # PyTorch's bug, calling torch.cuda.empty_cache() will error\n"
" # when using pluggable allocator, see\n"
" # https://github.com/pytorch/pytorch/issues/145168 .\n"
" # if we have some memory allocated and then freed,\n"
" # the memory will not be released, e.g. in online\n"
" # quantization, where the model is created in higher\n"
" # precision, and then quantized in lower precision.\n"
" # Find all unused allocations and manually release them.\n"
" # TODO: we should expose `empty_cache` method in the memory\n"
" # pool.\n"
" # TODO: ask for help from PyTorch team to expose this method.\n"
" allocations = data[0].snapshot()\n"
" for allocation in allocations:\n"
" if allocation[\"allocated_size\"] == 0:\n"
" handle = self._python_free_callback(allocation[\"address\"])\n"
" unmap_and_release(handle)\n"
" finally:\n"
" self.current_tag = old_tag\n"
" if expandable_was_enabled:\n"
' torch.cuda.memory._set_allocator_settings("expandable_segments:True")\n'
)

PATCHES = [
("CuMemAllocator.__init__ assertion removal", INIT_OLD, INIT_NEW),
("CuMemAllocator.use_memory_pool toggle", POOL_OLD, POOL_NEW),
]


def main():
if not TARGET.exists():
print(f"[vllm-cumem-expandable-fix] Target not found: {TARGET}", file=sys.stderr)
sys.exit(1)

content = TARGET.read_text()
if MARKER in content:
print("[vllm-cumem-expandable-fix] Already patched, skipping.", file=sys.stderr)
return

new_content = content
for name, old, new in PATCHES:
count = new_content.count(old)
if count == 0:
print(
f"[vllm-cumem-expandable-fix] Anchor for {name!r} not found. "
"vLLM version may have drifted; inspect cumem.py.",
file=sys.stderr,
)
sys.exit(1)
if count > 1:
print(
f"[vllm-cumem-expandable-fix] Anchor for {name!r} is ambiguous "
f"({count} matches); refusing to patch.",
file=sys.stderr,
)
sys.exit(1)
new_content = new_content.replace(old, new, 1)
print(f"[vllm-cumem-expandable-fix] Patched {name}", file=sys.stderr)

TARGET.write_text(new_content)
print("[vllm-cumem-expandable-fix] Done.", file=sys.stderr)


if __name__ == "__main__":
main()
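
For context on the patch above, a minimal usage sketch of the setup it targets (the model name and sleep level are illustrative assumptions, not part of the patch): with stock vLLM, constructing the engine below aborts in CuMemAllocator.__init__; with the patch applied, expandable segments stay enabled for normal allocations and are toggled off only inside use_memory_pool().

    import os

    # Must be set before vLLM / CUDA initialization.
    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

    from vllm import LLM

    # With sleep mode enabled, weight and KV-cache allocations go through
    # CuMemAllocator's memory pool, which is where the patched toggle runs.
    llm = LLM(model="facebook/opt-125m", enable_sleep_mode=True)
    llm.sleep(level=1)
    llm.wake_up()
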
configs/patches/vllm_deepseek_v4_mega_moe_free_orig.py (111 additions, 0 deletions)
@@ -0,0 +1,111 @@
"""
Free original DeepSeek V4 MoE expert weights after MegaMoE finalize.

Symptom (seen on GB200 decode, EP=8, VLLM_DEEPSEEK_V4_USE_MEGA_MOE=1):
torch.OutOfMemoryError: Tried to allocate 1008.00 MiB.
GPU 0 has a total capacity of 184.31 GiB of which 381.44 MiB is free.
181.02 GiB allocated by PyTorch.
Stack ends in deep_gemm/mega/__init__.py interleave():
torch.empty_like(t).copy_(torch.stack([gate, up], dim=2).reshape(...))

Root cause: DeepseekV4MegaMoEExperts.finalize_weights() builds
self._transformed_l1_weights / _transformed_l2_weights but does NOT release
the original self.w13_weight / w2_weight / *_weight_scale parameters. Both
copies stay resident on the GPU through the finalize loop, and on EP=8 the
per-rank weight footprint (~125 GiB) plus this duplication leaves no
headroom for the per-layer interleave temporaries (~1 GiB peak).

The forward path (deepseek_v4.py: _run_mega_moe, ~lines 538-547) was
verified to read only self._transformed_l1_weights / _transformed_l2_weights;
the original w13_weight / w2_weight / *_weight_scale are dead after finalize.

Fix (mirrors upstream PR vllm-project/vllm#40860): at the end of each
expert module's finalize_weights(), drop the four original Parameters by
assigning them to None so the module no longer holds references to them.
transform_weights_for_mega_moe allocates fresh L1 + SF tensors; only the
L2 weight aliases the original w2_weight storage, and because
_transformed_l2_weights still holds that reference, the storage stays
live via refcount. PyTorch's caching allocator can then reuse the freed
storage for the NEXT layer's interleave temporaries within the same
finalize loop.

Reference: vllm/model_executor/models/deepseek_v4.py,
DeepseekV4MegaMoEExperts.finalize_weights().
"""

import sys
from pathlib import Path

TARGET = Path(
"/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/deepseek_v4.py"
)

# Idempotency marker
MARKER = "srt-slurm-sa hotfix: free original MegaMoE expert weights"

# Anchor: closing of the _transformed_l1/l2 assignment in finalize_weights().
# The triple-`)` pattern is unique in the file.
OLD = (
" self._transformed_l1_weights, self._transformed_l2_weights = (\n"
" deep_gemm.transform_weights_for_mega_moe(\n"
" (self.w13_weight.data.view(torch.int8).contiguous(), w13_scale),\n"
" (self.w2_weight.data.view(torch.int8).contiguous(), w2_scale),\n"
" )\n"
" )\n"
)

NEW = (
OLD
+ " # srt-slurm-sa hotfix: free original MegaMoE expert weights.\n"
+ " # Mirrors vllm-project/vllm#40860. transform_weights_for_mega_moe\n"
+ " # allocates fresh L1 + SF tensors; only the L2 weight aliases the\n"
+ " # original w2_weight storage, but _transformed_l2_weights holds that\n"
+ " # reference, so dropping the Parameters is safe via refcount and the\n"
+ " # freed storage returns to the caching allocator in time for the next\n"
+ " # layer's interleave temp (~1 GiB).\n"
+ " self.w13_weight = None\n"
+ " self.w13_weight_scale = None\n"
+ " self.w2_weight = None\n"
+ " self.w2_weight_scale = None\n"
)


def main():
if not TARGET.exists():
print(f"[vllm-mega-moe-free-orig] Target not found: {TARGET}", file=sys.stderr)
sys.exit(1)

content = TARGET.read_text()

if MARKER in content:
print("[vllm-mega-moe-free-orig] Already patched, skipping.", file=sys.stderr)
return

count = content.count(OLD)
if count == 0:
print(
"[vllm-mega-moe-free-orig] Could not find finalize_weights anchor. "
"vLLM version may have drifted; inspect "
"DeepseekV4MegaMoEExperts.finalize_weights().",
file=sys.stderr,
)
sys.exit(1)
if count > 1:
print(
f"[vllm-mega-moe-free-orig] Anchor is ambiguous ({count} occurrences); "
"refusing to patch.",
file=sys.stderr,
)
sys.exit(1)

content = content.replace(OLD, NEW)
TARGET.write_text(content)
print(
"[vllm-mega-moe-free-orig] Freed original w13/w2 weights and scales "
"in DeepseekV4MegaMoEExperts.finalize_weights().",
file=sys.stderr,
)


if __name__ == "__main__":
main()
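
The fix above leans on standard PyTorch module semantics: assigning None to a registered Parameter attribute clears the module's reference, and the CUDA storage returns to the caching allocator once nothing else points at it. A minimal standalone sketch (the toy module and tensor size are illustrative, not vLLM code; requires a CUDA device):

    import torch
    import torch.nn as nn

    class Expert(nn.Module):
        def __init__(self):
            super().__init__()
            # Stand-in for w13_weight: a large CUDA parameter.
            self.w13_weight = nn.Parameter(
                torch.empty(4096, 4096, device="cuda"), requires_grad=False
            )

    m = Expert()
    before = torch.cuda.memory_allocated()
    m.w13_weight = None  # module entry becomes None; dropped from named_parameters()
    after = torch.cuda.memory_allocated()
    assert "w13_weight" not in dict(m.named_parameters())
    assert after < before  # storage freed once the last reference is gone
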
configs/vllm-container-deps.sh (18 additions, 0 deletions)
@@ -4,6 +4,24 @@

pip install msgpack

# Upgrade flashinfer to v0.6.9 (commands currently commented out).
# flashinfer-python / flashinfer-cubin are published only on PyPI, while
# flashinfer-jit-cache is CUDA-specific and lives only on the cu130 index;
# --index-url replaces PyPI entirely, so the upgrade is split into two calls.
# pip install --upgrade flashinfer-python==0.6.9 flashinfer-cubin==0.6.9
# pip install --upgrade flashinfer-jit-cache==0.6.9 --index-url https://flashinfer.ai/whl/cu130

if [ -f /configs/patches/vllm_numa_bind_hash_fix.py ]; then
python3 /configs/patches/vllm_numa_bind_hash_fix.py
fi

if [ -f /configs/patches/vllm_nvlink_one_sided_bf16_fix.py ]; then
python3 /configs/patches/vllm_nvlink_one_sided_bf16_fix.py
fi

if [ -f /configs/patches/vllm_deepseek_v4_mega_moe_free_orig.py ]; then
python3 /configs/patches/vllm_deepseek_v4_mega_moe_free_orig.py
fi

if [ -f /configs/patches/vllm_cumem_expandable_segments_fix.py ]; then
python3 /configs/patches/vllm_cumem_expandable_segments_fix.py
fi
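
Once the script above has run inside the container, a quick way to confirm that the two new patches actually landed is to look for their idempotency markers. A small Python check along these lines (the paths and marker strings are the ones defined in the patch scripts above):

    from pathlib import Path

    # Target paths and marker strings come from the two patch scripts above.
    checks = {
        "/usr/local/lib/python3.12/dist-packages/vllm/device_allocator/cumem.py": (
            'expandable_was_enabled = "expandable_segments:True" in conf'
        ),
        "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/"
        "deepseek_v4.py": "srt-slurm-sa hotfix: free original MegaMoE expert weights",
    }
    for path, marker in checks.items():
        status = "patched" if marker in Path(path).read_text() else "NOT patched"
        print(f"{path}: {status}")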