53 commits
6da81c4
add
ywang96 Apr 24, 2026
d3b958b
update
ywang96 Apr 24, 2026
af67085
fix dynamo
ywang96 Apr 24, 2026
951248d
1p1d
ywang96 Apr 24, 2026
daa1e4a
fix
ywang96 Apr 24, 2026
1bf11fb
add
ywang96 Apr 24, 2026
f330771
add
ywang96 Apr 24, 2026
466bb99
add
ywang96 Apr 24, 2026
6f4e65c
add
ywang96 Apr 24, 2026
6573922
add dsv4 tokenizer
richardhuo-nv Apr 24, 2026
cc50dc3
Set DeepSeek V4 SA-Bench tokenizer
alec-flowers Apr 24, 2026
e6eecd5
Add DeepSeek V4 tokenizer mode for SA-Bench
alec-flowers Apr 24, 2026
12e0b61
Refresh DeepSeek V4 offload recipes
alec-flowers Apr 24, 2026
61ec64e
update
ywang96 Apr 24, 2026
d60e3f1
Pin Dynamo commit for DeepSeek V4 recipes
alec-flowers Apr 24, 2026
ef5c889
Keep only DeepSeek V4 offload recipe
alec-flowers Apr 24, 2026
f8c41b4
Add DeepSeek V4 GB200 decode TEP recipe
alec-flowers Apr 24, 2026
edd2b6d
update pareto
hjjq Apr 25, 2026
38c0cbf
Add DeepSeek V4 aggregate GB200 and GB300 recipes
alec-flowers Apr 25, 2026
0fe49ba
Fix DeepSeek V4 recipe health checks
alec-flowers Apr 26, 2026
1e9cbf6
add (#78)
ywang96 Apr 26, 2026
5ebf140
Sanitize DeepSeek V4 recipes
alec-flowers Apr 26, 2026
9242af3
Remove explicit warmup count from DeepSeek V4 recipes
alec-flowers Apr 26, 2026
d5148e6
Add low-latency/mid-curve config
kyleliang-nv Apr 26, 2026
44e899e
Use supported benchmark runner for DeepSeek V4 disagg recipes
alec-flowers Apr 26, 2026
be6c195
Set SA-Bench options for DeepSeek V4 disagg recipes
alec-flowers Apr 26, 2026
fa73f0a
Remove GB300 aggregate DeepSeek V4 recipes
alec-flowers Apr 26, 2026
6fda96c
Pin vLLM image for new GB200 disagg recipes
alec-flowers Apr 26, 2026
24f3845
Remove GB200 aggregate DeepSeek V4 recipes
alec-flowers Apr 26, 2026
5116dac
Retry Dynamo source clone during install
alec-flowers Apr 26, 2026
d55b594
Add one-sided vLLM setup script
alec-flowers Apr 26, 2026
667a947
Add vllm-disagg GB200 8k1k max-tput configs
kyleliang-nv Apr 26, 2026
25bf05d
smaller sa set
alec-flowers Apr 26, 2026
0060f85
Install ai-dynamo wheel for DSV4 vLLM recipes
alec-flowers Apr 26, 2026
ee91188
Install ai-dynamo runtime with wheel path
alec-flowers Apr 26, 2026
e5b7684
Add GB300 vLLM-disagg 8k1k low-latency configs
kyleliang-nv Apr 26, 2026
1b8f02d
Add GB300 vLLM-disagg 8k1k mid-curve configs
kyleliang-nv Apr 26, 2026
3df0c65
Add GB300 vLLM-disagg 8k1k max-tput configs
kyleliang-nv Apr 26, 2026
65a37e5
Use python3 for ai-dynamo wheel install
alec-flowers Apr 26, 2026
d2a22cd
Add low concurrencies to DSV4 SA 1p8d recipe
alec-flowers Apr 26, 2026
680c048
Use Dynamo version for DSV4 wheel staging
alec-flowers Apr 26, 2026
ac52ab2
Revert "Use Dynamo version for DSV4 wheel staging"
alec-flowers Apr 26, 2026
1624e9a
Use wheel version for DSV4 Dynamo staging
alec-flowers Apr 26, 2026
8546708
Require exact Dynamo wheel versions
alec-flowers Apr 27, 2026
6ead346
add numactl install to vLLM prologue
hjjq Apr 27, 2026
72f91ef
add pr84 patches
hjjq Apr 28, 2026
2f89dc9
update 1s-nvl patch to reflect latest state of PR
hjjq Apr 29, 2026
8717863
Revert "update 1s-nvl patch to reflect latest state of PR"
hjjq Apr 29, 2026
76ff18f
try fix
hjjq Apr 29, 2026
79bc17f
Add GB300 vLLM-disagg DSv4-pro 8k1k mid-curve/low-latency configs
kyleliang-nv Apr 30, 2026
72dbf1c
add HT, move around
hjjq Apr 30, 2026
81e4c77
Fix job name
kyleliang-nv Apr 30, 2026
758becd
Fix Slurm job Python environment isolation
alec-flowers May 1, 2026
1 change: 1 addition & 0 deletions .gitignore
@@ -47,6 +47,7 @@ configs/etcdctl
configs/*.whl
configs/*.deb
configs/*.tar.gz
configs/wheels/

.ruff_cache/
*.egg-info/
90 changes: 90 additions & 0 deletions configs/install-ai-dynamo.sh
@@ -0,0 +1,90 @@
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

set -euo pipefail

DYNAMO_VERSION="${DYNAMO_VERSION:-}"

if [ -z "${DYNAMO_VERSION}" ]; then
echo "ERROR: DYNAMO_VERSION must be set for ai-dynamo wheel install" >&2
exit 1
fi

DYNAMO_PACKAGE="ai-dynamo==${DYNAMO_VERSION}"
DYNAMO_RUNTIME_PACKAGE="ai-dynamo-runtime==${DYNAMO_VERSION}"
DYNAMO_WHEEL_NAME="${DYNAMO_WHEEL_NAME:-ai_dynamo-${DYNAMO_VERSION}-py3-none-any.whl}"
DYNAMO_RUNTIME_WHEEL_PATTERN="${DYNAMO_RUNTIME_WHEEL_PATTERN:-ai_dynamo_runtime-${DYNAMO_VERSION}-*.whl}"
DYNAMO_WHEEL_DIRS="${DYNAMO_WHEEL_DIRS:-/configs/wheels /configs}"
PYTHON_BIN="${PYTHON_BIN:-}"

if [ -z "${PYTHON_BIN}" ]; then
if command -v python3 >/dev/null 2>&1; then
PYTHON_BIN="python3"
elif command -v python >/dev/null 2>&1; then
PYTHON_BIN="python"
else
echo "ERROR: neither python3 nor python found in PATH" >&2
exit 127
fi
fi

if "${PYTHON_BIN}" - <<PY
import importlib.metadata
import sys

wanted = "${DYNAMO_VERSION}"
packages = ("ai-dynamo", "ai-dynamo-runtime")
for package in packages:
try:
installed = importlib.metadata.version(package)
except importlib.metadata.PackageNotFoundError:
sys.exit(1)
if installed != wanted:
sys.exit(1)

import dynamo.llm # noqa: F401

sys.exit(0)
PY
then
echo "ai-dynamo and ai-dynamo-runtime ${DYNAMO_VERSION} already installed"
exit 0
fi

find_wheel() {
    local pattern="$1"
    local wheel_dir match
    for wheel_dir in ${DYNAMO_WHEEL_DIRS}; do
        [ -d "${wheel_dir}" ] || continue
        # Return only the first match so callers get a single path even when
        # matching wheels are staged in more than one search directory.
        match="$(find "${wheel_dir}" -maxdepth 1 -type f -name "${pattern}" -print -quit)"
        if [ -n "${match}" ]; then
            echo "${match}"
            return 0
        fi
    done
}

dynamo_wheel="$(find_wheel "${DYNAMO_WHEEL_NAME}")"
runtime_wheel="$(find_wheel "${DYNAMO_RUNTIME_WHEEL_PATTERN}")"

find_links_args=()
for wheel_dir in ${DYNAMO_WHEEL_DIRS}; do
    [ -d "${wheel_dir}" ] || continue
    find_links_args+=(--find-links "${wheel_dir}")
done

if [ -n "${dynamo_wheel}" ] && [ -n "${runtime_wheel}" ]; then
echo "Installing ai-dynamo-runtime and ai-dynamo ${DYNAMO_VERSION} from local wheels"
"${PYTHON_BIN}" -m pip install \
--pre \
--no-deps \
--no-index \
"${find_links_args[@]}" \
"${DYNAMO_RUNTIME_PACKAGE}" \
"${DYNAMO_PACKAGE}"
else
echo "ERROR: exact ai-dynamo wheels for ${DYNAMO_VERSION} were not found in ${DYNAMO_WHEEL_DIRS}" >&2
echo "ERROR: expected ${DYNAMO_WHEEL_NAME} and ${DYNAMO_RUNTIME_WHEEL_PATTERN}" >&2
exit 1
fi

"${PYTHON_BIN}" - <<'PY'
import dynamo.llm # noqa: F401
PY
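
For reference, the gate the script applies before reinstalling boils down to one check: both distributions must report the exact pinned version, and the compiled extension must actually import. Below is a minimal sketch of that logic; `dynamo_install_ok` and its `wanted` argument are illustrative names, not part of the script.

import importlib.metadata

def dynamo_install_ok(wanted: str) -> bool:
    for package in ("ai-dynamo", "ai-dynamo-runtime"):
        try:
            if importlib.metadata.version(package) != wanted:
                return False  # version drift: reinstall from local wheels
        except importlib.metadata.PackageNotFoundError:
            return False  # distribution missing entirely
    try:
        import dynamo.llm  # noqa: F401  # smoke-test the native extension
    except ImportError:
        return False  # wheel installed but extension unusable
    return True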
169 changes: 169 additions & 0 deletions configs/patches/vllm_cumem_expandable_segments_fix.py
@@ -0,0 +1,169 @@
"""
Patch vLLM's CuMemAllocator to be compatible with PyTorch expandable
segments by temporarily toggling the allocator setting around the memory
pool context (sleep mode), instead of hard-asserting at __init__ time.

Backports vllm-project/vllm#40812 ("Auto-disable expandable_segments
around cumem memory pool"). Without this patch, setting
PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True together with
enable-sleep-mode causes vLLM to abort during CuMemAllocator
construction; with this patch, expandable segments stay on for normal
allocations and are flipped off only for the duration of
use_memory_pool().

Reference: https://github.com/vllm-project/vllm/pull/40812
Affected file: vllm/device_allocator/cumem.py
"""

import sys
from pathlib import Path

TARGET = Path("/usr/local/lib/python3.12/dist-packages/vllm/device_allocator/cumem.py")

# Idempotency: the new use_memory_pool body introduces this exact line.
MARKER = 'expandable_was_enabled = "expandable_segments:True" in conf'

# --- Hunk 1: drop the __init__ assertion -------------------------------------

INIT_OLD = (
    "    def __init__(self):\n"
    '        conf = os.environ.get("PYTORCH_CUDA_ALLOC_CONF", "")\n'
    '        assert "expandable_segments:True" not in conf, (\n'
    '            "Expandable segments are not compatible with memory pool. "\n'
    '            "Please track https://github.com/pytorch/pytorch/issues/147851 "\n'
    '            "for the latest updates."\n'
    "        )\n"
    "\n"
    "        self.pointer_to_data: dict[int, AllocationData] = {}\n"
)

INIT_NEW = (
    "    def __init__(self):\n"
    "        self.pointer_to_data: dict[int, AllocationData] = {}\n"
)

# --- Hunk 2: wrap use_memory_pool body in try/finally + toggle ---------------

POOL_OLD = (
    "        assert isinstance(tag, str)\n"
    "\n"
    "        old_tag = self.current_tag\n"
    "        self.current_tag = tag\n"
    "        with use_memory_pool_with_allocator(\n"
    "            self.python_malloc_callback, self.python_free_callback\n"
    "        ) as data:\n"
    "            # start to hit another PyTorch bug in PyTorch 2.6,\n"
    "            # possibly because of gc-related issue w.r.t. the allocator and\n"
    "            # the memory pool.\n"
    "            # to avoid the issue, we keep a reference of the data.\n"
    "            # see https://github.com/pytorch/pytorch/issues/146431 .\n"
    "            self.allocator_and_pools[tag] = data\n"
    "            yield\n"
    "            # PyTorch's bug, calling torch.cuda.empty_cache() will error\n"
    "            # when using pluggable allocator, see\n"
    "            # https://github.com/pytorch/pytorch/issues/145168 .\n"
    "            # if we have some memory allocated and then freed,\n"
    "            # the memory will not be released, e.g. in online quantization,\n"
    "            # where the model is created in higher precision, and then\n"
    "            # quantized in lower precision.\n"
    "            # Find all unused allocations and manually release them.\n"
    "            # TODO: we should expose `empty_cache` method in the memory pool.\n"
    "            # TODO: ask for help from PyTorch team to expose this method.\n"
    "            allocations = data[0].snapshot()\n"
    "            for allocation in allocations:\n"
    "                if allocation[\"allocated_size\"] == 0:\n"
    "                    handle = self._python_free_callback(allocation[\"address\"])\n"
    "                    unmap_and_release(handle)\n"
    "        self.current_tag = old_tag\n"
)

POOL_NEW = (
    "        assert isinstance(tag, str)\n"
    "\n"
    "        # Expandable segments are incompatible with the memory pool used for\n"
    "        # sleep mode (see https://github.com/pytorch/pytorch/issues/147851).\n"
    "        # If the user has enabled expandable segments via\n"
    "        # PYTORCH_CUDA_ALLOC_CONF, temporarily disable them for the duration\n"
    "        # of the memory pool context and restore on exit.\n"
    '        conf = os.environ.get("PYTORCH_CUDA_ALLOC_CONF", "")\n'
    '        expandable_was_enabled = "expandable_segments:True" in conf\n'
    "        if expandable_was_enabled:\n"
    '            torch.cuda.memory._set_allocator_settings("expandable_segments:False")\n'
    "\n"
    "        old_tag = self.current_tag\n"
    "        self.current_tag = tag\n"
    "        try:\n"
    "            with use_memory_pool_with_allocator(\n"
    "                self.python_malloc_callback, self.python_free_callback\n"
    "            ) as data:\n"
    "                # start to hit another PyTorch bug in PyTorch 2.6,\n"
    "                # possibly because of gc-related issue w.r.t. the allocator\n"
    "                # and the memory pool.\n"
    "                # to avoid the issue, we keep a reference of the data.\n"
    "                # see https://github.com/pytorch/pytorch/issues/146431 .\n"
    "                self.allocator_and_pools[tag] = data\n"
    "                yield\n"
    "                # PyTorch's bug, calling torch.cuda.empty_cache() will error\n"
    "                # when using pluggable allocator, see\n"
    "                # https://github.com/pytorch/pytorch/issues/145168 .\n"
    "                # if we have some memory allocated and then freed,\n"
    "                # the memory will not be released, e.g. in online\n"
    "                # quantization, where the model is created in higher\n"
    "                # precision, and then quantized in lower precision.\n"
    "                # Find all unused allocations and manually release them.\n"
    "                # TODO: we should expose `empty_cache` method in the memory\n"
    "                # pool.\n"
    "                # TODO: ask for help from PyTorch team to expose this method.\n"
    "                allocations = data[0].snapshot()\n"
    "                for allocation in allocations:\n"
    "                    if allocation[\"allocated_size\"] == 0:\n"
    "                        handle = self._python_free_callback(allocation[\"address\"])\n"
    "                        unmap_and_release(handle)\n"
    "        finally:\n"
    "            self.current_tag = old_tag\n"
    "            if expandable_was_enabled:\n"
    '                torch.cuda.memory._set_allocator_settings("expandable_segments:True")\n'
)

PATCHES = [
    ("CuMemAllocator.__init__ assertion removal", INIT_OLD, INIT_NEW),
    ("CuMemAllocator.use_memory_pool toggle", POOL_OLD, POOL_NEW),
]


def main():
    if not TARGET.exists():
        print(f"[vllm-cumem-expandable-fix] Target not found: {TARGET}", file=sys.stderr)
        sys.exit(1)

    content = TARGET.read_text()
    if MARKER in content:
        print("[vllm-cumem-expandable-fix] Already patched, skipping.", file=sys.stderr)
        return

    new_content = content
    for name, old, new in PATCHES:
        count = new_content.count(old)
        if count == 0:
            print(
                f"[vllm-cumem-expandable-fix] Anchor for {name!r} not found. "
                "vLLM version may have drifted; inspect cumem.py.",
                file=sys.stderr,
            )
            sys.exit(1)
        if count > 1:
            print(
                f"[vllm-cumem-expandable-fix] Anchor for {name!r} is ambiguous "
                f"({count} matches); refusing to patch.",
                file=sys.stderr,
            )
            sys.exit(1)
        new_content = new_content.replace(old, new, 1)
        print(f"[vllm-cumem-expandable-fix] Patched {name}", file=sys.stderr)

    TARGET.write_text(new_content)
    print("[vllm-cumem-expandable-fix] Done.", file=sys.stderr)


if __name__ == "__main__":
    main()
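
The toggle pattern the patch installs can be read in isolation as a context manager. Below is a minimal sketch under the same assumptions as the patch, not the patched vLLM code itself; it relies on the same private torch.cuda.memory._set_allocator_settings API that POOL_NEW uses.

import os
from contextlib import contextmanager

import torch

@contextmanager
def expandable_segments_disabled():
    # Flip expandable segments off only while the pool-backed region runs,
    # and restore the user's setting on exit even if the body raises.
    conf = os.environ.get("PYTORCH_CUDA_ALLOC_CONF", "")
    was_enabled = "expandable_segments:True" in conf
    if was_enabled:
        torch.cuda.memory._set_allocator_settings("expandable_segments:False")
    try:
        yield
    finally:
        if was_enabled:
            torch.cuda.memory._set_allocator_settings("expandable_segments:True")

The patched use_memory_pool() inlines the same logic with try/finally rather than a separate context manager, so the restore also runs when the wrapped generator exits early.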
111 changes: 111 additions & 0 deletions configs/patches/vllm_deepseek_v4_mega_moe_free_orig.py
@@ -0,0 +1,111 @@
"""
Free original DeepSeek V4 MoE expert weights after MegaMoE finalize.

Symptom (seen on GB200 decode, EP=8, VLLM_DEEPSEEK_V4_USE_MEGA_MOE=1):
torch.OutOfMemoryError: Tried to allocate 1008.00 MiB.
GPU 0 has a total capacity of 184.31 GiB of which 381.44 MiB is free.
181.02 GiB allocated by PyTorch.
Stack ends in deep_gemm/mega/__init__.py interleave():
torch.empty_like(t).copy_(torch.stack([gate, up], dim=2).reshape(...))

Root cause: DeepseekV4MegaMoEExperts.finalize_weights() builds
self._transformed_l1_weights / _transformed_l2_weights but does NOT release
the original self.w13_weight / w2_weight / *_weight_scale parameters. Both
copies stay resident on GPU through the finalize iteration, and on EP=8 the
per-rank weight footprint (~125 GiB) plus this duplication leaves no
headroom for the per-layer interleave temporaries (~1 GiB peak).

The forward path (deepseek_v4.py: _run_mega_moe, ~lines 538-547) was
verified to read only self._transformed_l1_weights / _transformed_l2_weights;
the original w13_weight / w2_weight / *_weight_scale are dead after finalize.

Fix (mirrors upstream PR vllm-project/vllm#40860): at the end of
finalize_weights() of each expert module, drop the four original
Parameters by assigning them to None so they are removed from the module's
_parameters dict. transform_weights_for_mega_moe allocates fresh L1 + SF
tensors and only the L2 weight aliases the original w2_weight storage --
_transformed_l2_weights still holds that reference, so the storage stays
live via refcount. PyTorch's caching allocator can then reuse the freed
storage for the NEXT layer's interleave temporaries within the same
finalize loop.

Reference: vllm/model_executor/models/deepseek_v4.py,
DeepseekV4MegaMoEExperts.finalize_weights().
"""

import sys
from pathlib import Path

TARGET = Path(
    "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/deepseek_v4.py"
)

# Idempotency marker
MARKER = "srt-slurm-sa hotfix: free original MegaMoE expert weights"

# Anchor: closing of the _transformed_l1/l2 assignment in finalize_weights().
# The triple-`)` pattern is unique in the file.
OLD = (
    "        self._transformed_l1_weights, self._transformed_l2_weights = (\n"
    "            deep_gemm.transform_weights_for_mega_moe(\n"
    "                (self.w13_weight.data.view(torch.int8).contiguous(), w13_scale),\n"
    "                (self.w2_weight.data.view(torch.int8).contiguous(), w2_scale),\n"
    "            )\n"
    "        )\n"
)

NEW = (
    OLD
    + "        # srt-slurm-sa hotfix: free original MegaMoE expert weights.\n"
    + "        # Mirrors vllm-project/vllm#40860. transform_weights_for_mega_moe\n"
    + "        # allocates fresh L1 + SF tensors; only the L2 weight aliases the\n"
    + "        # original w2_weight storage, but _transformed_l2_weights holds that\n"
    + "        # reference, so dropping the Parameters is safe via refcount and the\n"
    + "        # freed storage returns to the caching allocator in time for the next\n"
    + "        # layer's interleave temp (~1 GiB).\n"
    + "        self.w13_weight = None\n"
    + "        self.w13_weight_scale = None\n"
    + "        self.w2_weight = None\n"
    + "        self.w2_weight_scale = None\n"
)


def main():
    if not TARGET.exists():
        print(f"[vllm-mega-moe-free-orig] Target not found: {TARGET}", file=sys.stderr)
        sys.exit(1)

    content = TARGET.read_text()

    if MARKER in content:
        print("[vllm-mega-moe-free-orig] Already patched, skipping.", file=sys.stderr)
        return

    count = content.count(OLD)
    if count == 0:
        print(
            "[vllm-mega-moe-free-orig] Could not find finalize_weights anchor. "
            "vLLM version may have drifted; inspect "
            "DeepseekV4MegaMoEExperts.finalize_weights().",
            file=sys.stderr,
        )
        sys.exit(1)
    if count > 1:
        print(
            f"[vllm-mega-moe-free-orig] Anchor is ambiguous ({count} occurrences); "
            "refusing to patch.",
            file=sys.stderr,
        )
        sys.exit(1)

    content = content.replace(OLD, NEW)
    TARGET.write_text(content)
    print(
        "[vllm-mega-moe-free-orig] Freed original w13/w2 weights and scales "
        "in DeepseekV4MegaMoEExperts.finalize_weights().",
        file=sys.stderr,
    )


if __name__ == "__main__":
    main()
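
The mechanism the hotfix relies on can be shown with a toy module; the class name and tensor sizes below are illustrative, not the vLLM classes. Assigning None to a registered Parameter removes it from the module's _parameters dict, and once the last tensor reference is gone the CUDA caching allocator can reuse the storage.

import torch
import torch.nn as nn

class ToyExperts(nn.Module):
    def __init__(self):
        super().__init__()
        self.w13_weight = nn.Parameter(
            torch.empty(4096, 4096), requires_grad=False
        )

    def finalize_weights(self):
        # Build the layout the forward path will actually read; .contiguous()
        # materializes a fresh tensor, so it does not alias the Parameter.
        self._transformed = self.w13_weight.t().contiguous()
        # Drop the original Parameter. nn.Module.__setattr__ routes the None
        # through register_parameter, removing the entry, and with no
        # remaining references the storage is returned to the allocator for
        # the next layer's temporaries.
        self.w13_weight = None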