53 commits
6da81c4
add
ywang96 Apr 24, 2026
d3b958b
update
ywang96 Apr 24, 2026
af67085
fix dynamo
ywang96 Apr 24, 2026
951248d
1p1d
ywang96 Apr 24, 2026
daa1e4a
fix
ywang96 Apr 24, 2026
1bf11fb
add
ywang96 Apr 24, 2026
f330771
add
ywang96 Apr 24, 2026
466bb99
add
ywang96 Apr 24, 2026
6f4e65c
add
ywang96 Apr 24, 2026
6573922
add dsv4 tokenizer
richardhuo-nv Apr 24, 2026
cc50dc3
Set DeepSeek V4 SA-Bench tokenizer
alec-flowers Apr 24, 2026
e6eecd5
Add DeepSeek V4 tokenizer mode for SA-Bench
alec-flowers Apr 24, 2026
12e0b61
Refresh DeepSeek V4 offload recipes
alec-flowers Apr 24, 2026
61ec64e
update
ywang96 Apr 24, 2026
d60e3f1
Pin Dynamo commit for DeepSeek V4 recipes
alec-flowers Apr 24, 2026
ef5c889
Keep only DeepSeek V4 offload recipe
alec-flowers Apr 24, 2026
f8c41b4
Add DeepSeek V4 GB200 decode TEP recipe
alec-flowers Apr 24, 2026
edd2b6d
update pareto
hjjq Apr 25, 2026
38c0cbf
Add DeepSeek V4 aggregate GB200 and GB300 recipes
alec-flowers Apr 25, 2026
0fe49ba
Fix DeepSeek V4 recipe health checks
alec-flowers Apr 26, 2026
1e9cbf6
add (#78)
ywang96 Apr 26, 2026
5ebf140
Sanitize DeepSeek V4 recipes
alec-flowers Apr 26, 2026
9242af3
Remove explicit warmup count from DeepSeek V4 recipes
alec-flowers Apr 26, 2026
d5148e6
Add low-latency/mid-curve config
kyleliang-nv Apr 26, 2026
44e899e
Use supported benchmark runner for DeepSeek V4 disagg recipes
alec-flowers Apr 26, 2026
be6c195
Set SA-Bench options for DeepSeek V4 disagg recipes
alec-flowers Apr 26, 2026
fa73f0a
Remove GB300 aggregate DeepSeek V4 recipes
alec-flowers Apr 26, 2026
6fda96c
Pin vLLM image for new GB200 disagg recipes
alec-flowers Apr 26, 2026
24f3845
Remove GB200 aggregate DeepSeek V4 recipes
alec-flowers Apr 26, 2026
5116dac
Retry Dynamo source clone during install
alec-flowers Apr 26, 2026
d55b594
Add one-sided vLLM setup script
alec-flowers Apr 26, 2026
667a947
Add vllm-disagg GB200 8k1k max-tput configs
kyleliang-nv Apr 26, 2026
25bf05d
smaller sa set
alec-flowers Apr 26, 2026
0060f85
Install ai-dynamo wheel for DSV4 vLLM recipes
alec-flowers Apr 26, 2026
ee91188
Install ai-dynamo runtime with wheel path
alec-flowers Apr 26, 2026
e5b7684
Add GB300 vLLM-disagg 8k1k low-latency configs
kyleliang-nv Apr 26, 2026
1b8f02d
Add GB300 vLLM-disagg 8k1k mid-curve configs
kyleliang-nv Apr 26, 2026
3df0c65
Add GB300 vLLM-disagg 8k1k max-tput configs
kyleliang-nv Apr 26, 2026
65a37e5
Use python3 for ai-dynamo wheel install
alec-flowers Apr 26, 2026
d2a22cd
Add low concurrencies to DSV4 SA 1p8d recipe
alec-flowers Apr 26, 2026
680c048
Use Dynamo version for DSV4 wheel staging
alec-flowers Apr 26, 2026
ac52ab2
Revert "Use Dynamo version for DSV4 wheel staging"
alec-flowers Apr 26, 2026
1624e9a
Use wheel version for DSV4 Dynamo staging
alec-flowers Apr 26, 2026
8546708
Require exact Dynamo wheel versions
alec-flowers Apr 27, 2026
6ead346
add numactl install to vLLM prologue
hjjq Apr 27, 2026
72f91ef
add pr84 patches
hjjq Apr 28, 2026
2f89dc9
update 1s-nvl patch to reflect latest state of PR
hjjq Apr 29, 2026
8717863
Revert "update 1s-nvl patch to reflect latest state of PR"
hjjq Apr 29, 2026
76ff18f
try fix
hjjq Apr 29, 2026
79bc17f
Add GB300 vLLM-disagg DSv4-pro 8k1k mid-curve/low-latency configs
kyleliang-nv Apr 30, 2026
72dbf1c
add HT, move around
hjjq Apr 30, 2026
81e4c77
Fix job name
kyleliang-nv Apr 30, 2026
758becd
Fix Slurm job Python environment isolation
alec-flowers May 1, 2026
1 change: 1 addition & 0 deletions .gitignore
@@ -47,6 +47,7 @@ configs/etcdctl
configs/*.whl
configs/*.deb
configs/*.tar.gz
configs/wheels/

.ruff_cache/
*.egg-info/
90 changes: 90 additions & 0 deletions configs/install-ai-dynamo.sh
@@ -0,0 +1,90 @@
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

set -euo pipefail

DYNAMO_VERSION="${DYNAMO_VERSION:-}"

if [ -z "${DYNAMO_VERSION}" ]; then
echo "ERROR: DYNAMO_VERSION must be set for ai-dynamo wheel install" >&2
exit 1
fi

DYNAMO_PACKAGE="ai-dynamo==${DYNAMO_VERSION}"
DYNAMO_RUNTIME_PACKAGE="ai-dynamo-runtime==${DYNAMO_VERSION}"
DYNAMO_WHEEL_NAME="${DYNAMO_WHEEL_NAME:-ai_dynamo-${DYNAMO_VERSION}-py3-none-any.whl}"
DYNAMO_RUNTIME_WHEEL_PATTERN="${DYNAMO_RUNTIME_WHEEL_PATTERN:-ai_dynamo_runtime-${DYNAMO_VERSION}-*.whl}"
DYNAMO_WHEEL_DIRS="${DYNAMO_WHEEL_DIRS:-/configs/wheels /configs}"
PYTHON_BIN="${PYTHON_BIN:-}"

if [ -z "${PYTHON_BIN}" ]; then
if command -v python3 >/dev/null 2>&1; then
PYTHON_BIN="python3"
elif command -v python >/dev/null 2>&1; then
PYTHON_BIN="python"
else
echo "ERROR: neither python3 nor python found in PATH" >&2
exit 127
fi
fi

if "${PYTHON_BIN}" - <<PY
import importlib.metadata
import sys

wanted = "${DYNAMO_VERSION}"
packages = ("ai-dynamo", "ai-dynamo-runtime")
for package in packages:
try:
installed = importlib.metadata.version(package)
except importlib.metadata.PackageNotFoundError:
sys.exit(1)
if installed != wanted:
sys.exit(1)

import dynamo.llm # noqa: F401

sys.exit(0)
PY
then
echo "ai-dynamo and ai-dynamo-runtime ${DYNAMO_VERSION} already installed"
exit 0
fi

find_wheel() {
    local pattern="$1"
    local wheel_dir match
    for wheel_dir in ${DYNAMO_WHEEL_DIRS}; do
        [ -d "${wheel_dir}" ] || continue
        # Return only the first match so callers get a single path even when
        # matching wheels are staged in more than one search directory.
        match="$(find "${wheel_dir}" -maxdepth 1 -type f -name "${pattern}" -print -quit)"
        if [ -n "${match}" ]; then
            echo "${match}"
            return 0
        fi
    done
}

dynamo_wheel="$(find_wheel "${DYNAMO_WHEEL_NAME}")"
runtime_wheel="$(find_wheel "${DYNAMO_RUNTIME_WHEEL_PATTERN}")"

find_links_args=()
for wheel_dir in ${DYNAMO_WHEEL_DIRS}; do
    [ -d "${wheel_dir}" ] || continue
    find_links_args+=(--find-links "${wheel_dir}")
done

if [ -n "${dynamo_wheel}" ] && [ -n "${runtime_wheel}" ]; then
echo "Installing ai-dynamo-runtime and ai-dynamo ${DYNAMO_VERSION} from local wheels"
"${PYTHON_BIN}" -m pip install \
--pre \
--no-deps \
--no-index \
"${find_links_args[@]}" \
"${DYNAMO_RUNTIME_PACKAGE}" \
"${DYNAMO_PACKAGE}"
else
echo "ERROR: exact ai-dynamo wheels for ${DYNAMO_VERSION} were not found in ${DYNAMO_WHEEL_DIRS}" >&2
echo "ERROR: expected ${DYNAMO_WHEEL_NAME} and ${DYNAMO_RUNTIME_WHEEL_PATTERN}" >&2
exit 1
fi

"${PYTHON_BIN}" - <<'PY'
import dynamo.llm # noqa: F401
PY
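
For reference, the gate the script applies before reinstalling boils down to one check: both distributions must report the exact pinned version, and the compiled extension must actually import. Below is a minimal sketch of that logic; `dynamo_install_ok` and its `wanted` argument are illustrative names, not part of the script.

import importlib.metadata

def dynamo_install_ok(wanted: str) -> bool:
    for package in ("ai-dynamo", "ai-dynamo-runtime"):
        try:
            if importlib.metadata.version(package) != wanted:
                return False  # version drift: reinstall from local wheels
        except importlib.metadata.PackageNotFoundError:
            return False  # distribution missing entirely
    try:
        import dynamo.llm  # noqa: F401  # smoke-test the native extension
    except ImportError:
        return False  # wheel installed but extension unusable
    return True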
169 changes: 169 additions & 0 deletions configs/patches/vllm_cumem_expandable_segments_fix.py
@@ -0,0 +1,169 @@
"""
Patch vLLM's CuMemAllocator to be compatible with PyTorch expandable
segments by temporarily toggling the allocator setting around the memory
pool context (sleep mode), instead of hard-asserting at __init__ time.

Backports vllm-project/vllm#40812 ("Auto-disable expandable_segments
around cumem memory pool"). Without this patch, setting
PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True together with
enable-sleep-mode causes vLLM to abort during CuMemAllocator
construction; with this patch, expandable segments stay on for normal
allocations and are flipped off only for the duration of
use_memory_pool().

Reference: https://github.com/vllm-project/vllm/pull/40812
Affected file: vllm/device_allocator/cumem.py
"""

import sys
from pathlib import Path

TARGET = Path("/usr/local/lib/python3.12/dist-packages/vllm/device_allocator/cumem.py")

# Idempotency: the new use_memory_pool body introduces this exact line.
MARKER = 'expandable_was_enabled = "expandable_segments:True" in conf'

# --- Hunk 1: drop the __init__ assertion -------------------------------------

INIT_OLD = (
    "    def __init__(self):\n"
    '        conf = os.environ.get("PYTORCH_CUDA_ALLOC_CONF", "")\n'
    '        assert "expandable_segments:True" not in conf, (\n'
    '            "Expandable segments are not compatible with memory pool. "\n'
    '            "Please track https://github.com/pytorch/pytorch/issues/147851 "\n'
    '            "for the latest updates."\n'
    "        )\n"
    "\n"
    "        self.pointer_to_data: dict[int, AllocationData] = {}\n"
)

INIT_NEW = (
    "    def __init__(self):\n"
    "        self.pointer_to_data: dict[int, AllocationData] = {}\n"
)

# --- Hunk 2: wrap use_memory_pool body in try/finally + toggle ---------------

POOL_OLD = (
    "        assert isinstance(tag, str)\n"
    "\n"
    "        old_tag = self.current_tag\n"
    "        self.current_tag = tag\n"
    "        with use_memory_pool_with_allocator(\n"
    "            self.python_malloc_callback, self.python_free_callback\n"
    "        ) as data:\n"
    "            # start to hit another PyTorch bug in PyTorch 2.6,\n"
    "            # possibly because of gc-related issue w.r.t. the allocator and\n"
    "            # the memory pool.\n"
    "            # to avoid the issue, we keep a reference of the data.\n"
    "            # see https://github.com/pytorch/pytorch/issues/146431 .\n"
    "            self.allocator_and_pools[tag] = data\n"
    "            yield\n"
    "            # PyTorch's bug, calling torch.cuda.empty_cache() will error\n"
    "            # when using pluggable allocator, see\n"
    "            # https://github.com/pytorch/pytorch/issues/145168 .\n"
    "            # if we have some memory allocated and then freed,\n"
    "            # the memory will not be released, e.g. in online quantization,\n"
    "            # where the model is created in higher precision, and then\n"
    "            # quantized in lower precision.\n"
    "            # Find all unused allocations and manually release them.\n"
    "            # TODO: we should expose `empty_cache` method in the memory pool.\n"
    "            # TODO: ask for help from PyTorch team to expose this method.\n"
    "            allocations = data[0].snapshot()\n"
    "            for allocation in allocations:\n"
    "                if allocation[\"allocated_size\"] == 0:\n"
    "                    handle = self._python_free_callback(allocation[\"address\"])\n"
    "                    unmap_and_release(handle)\n"
    "        self.current_tag = old_tag\n"
)

POOL_NEW = (
    "        assert isinstance(tag, str)\n"
    "\n"
    "        # Expandable segments are incompatible with the memory pool used for\n"
    "        # sleep mode (see https://github.com/pytorch/pytorch/issues/147851).\n"
    "        # If the user has enabled expandable segments via\n"
    "        # PYTORCH_CUDA_ALLOC_CONF, temporarily disable them for the duration\n"
    "        # of the memory pool context and restore on exit.\n"
    '        conf = os.environ.get("PYTORCH_CUDA_ALLOC_CONF", "")\n'
    '        expandable_was_enabled = "expandable_segments:True" in conf\n'
    "        if expandable_was_enabled:\n"
    '            torch.cuda.memory._set_allocator_settings("expandable_segments:False")\n'
    "\n"
    "        old_tag = self.current_tag\n"
    "        self.current_tag = tag\n"
    "        try:\n"
    "            with use_memory_pool_with_allocator(\n"
    "                self.python_malloc_callback, self.python_free_callback\n"
    "            ) as data:\n"
    "                # start to hit another PyTorch bug in PyTorch 2.6,\n"
    "                # possibly because of gc-related issue w.r.t. the allocator\n"
    "                # and the memory pool.\n"
    "                # to avoid the issue, we keep a reference of the data.\n"
    "                # see https://github.com/pytorch/pytorch/issues/146431 .\n"
    "                self.allocator_and_pools[tag] = data\n"
    "                yield\n"
    "                # PyTorch's bug, calling torch.cuda.empty_cache() will error\n"
    "                # when using pluggable allocator, see\n"
    "                # https://github.com/pytorch/pytorch/issues/145168 .\n"
    "                # if we have some memory allocated and then freed,\n"
    "                # the memory will not be released, e.g. in online\n"
    "                # quantization, where the model is created in higher\n"
    "                # precision, and then quantized in lower precision.\n"
    "                # Find all unused allocations and manually release them.\n"
    "                # TODO: we should expose `empty_cache` method in the memory\n"
    "                # pool.\n"
    "                # TODO: ask for help from PyTorch team to expose this method.\n"
    "                allocations = data[0].snapshot()\n"
    "                for allocation in allocations:\n"
    "                    if allocation[\"allocated_size\"] == 0:\n"
    "                        handle = self._python_free_callback(allocation[\"address\"])\n"
    "                        unmap_and_release(handle)\n"
    "        finally:\n"
    "            self.current_tag = old_tag\n"
    "            if expandable_was_enabled:\n"
    '                torch.cuda.memory._set_allocator_settings("expandable_segments:True")\n'
)

PATCHES = [
    ("CuMemAllocator.__init__ assertion removal", INIT_OLD, INIT_NEW),
    ("CuMemAllocator.use_memory_pool toggle", POOL_OLD, POOL_NEW),
]


def main():
    if not TARGET.exists():
        print(f"[vllm-cumem-expandable-fix] Target not found: {TARGET}", file=sys.stderr)
        sys.exit(1)

    content = TARGET.read_text()
    if MARKER in content:
        print("[vllm-cumem-expandable-fix] Already patched, skipping.", file=sys.stderr)
        return

    new_content = content
    for name, old, new in PATCHES:
        count = new_content.count(old)
        if count == 0:
            print(
                f"[vllm-cumem-expandable-fix] Anchor for {name!r} not found. "
                "vLLM version may have drifted; inspect cumem.py.",
                file=sys.stderr,
            )
            sys.exit(1)
        if count > 1:
            print(
                f"[vllm-cumem-expandable-fix] Anchor for {name!r} is ambiguous "
                f"({count} matches); refusing to patch.",
                file=sys.stderr,
            )
            sys.exit(1)
        new_content = new_content.replace(old, new, 1)
        print(f"[vllm-cumem-expandable-fix] Patched {name}", file=sys.stderr)

    TARGET.write_text(new_content)
    print("[vllm-cumem-expandable-fix] Done.", file=sys.stderr)


if __name__ == "__main__":
    main()
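
The toggle pattern the patch installs can be read in isolation as a context manager. Below is a minimal sketch under the same assumptions as the patch, not the patched vLLM code itself; it relies on the same private torch.cuda.memory._set_allocator_settings API that POOL_NEW uses.

import os
from contextlib import contextmanager

import torch

@contextmanager
def expandable_segments_disabled():
    # Flip expandable segments off only while the pool-backed region runs,
    # and restore the user's setting on exit even if the body raises.
    conf = os.environ.get("PYTORCH_CUDA_ALLOC_CONF", "")
    was_enabled = "expandable_segments:True" in conf
    if was_enabled:
        torch.cuda.memory._set_allocator_settings("expandable_segments:False")
    try:
        yield
    finally:
        if was_enabled:
            torch.cuda.memory._set_allocator_settings("expandable_segments:True")

The patched use_memory_pool() inlines the same logic with try/finally rather than a separate context manager, so the restore also runs when the wrapped generator exits early.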
111 changes: 111 additions & 0 deletions configs/patches/vllm_deepseek_v4_mega_moe_free_orig.py
@@ -0,0 +1,111 @@
"""
Free original DeepSeek V4 MoE expert weights after MegaMoE finalize.

Symptom (seen on GB200 decode, EP=8, VLLM_DEEPSEEK_V4_USE_MEGA_MOE=1):
torch.OutOfMemoryError: Tried to allocate 1008.00 MiB.
GPU 0 has a total capacity of 184.31 GiB of which 381.44 MiB is free.
181.02 GiB allocated by PyTorch.
Stack ends in deep_gemm/mega/__init__.py interleave():
torch.empty_like(t).copy_(torch.stack([gate, up], dim=2).reshape(...))

Root cause: DeepseekV4MegaMoEExperts.finalize_weights() builds
self._transformed_l1_weights / _transformed_l2_weights but does NOT release
the original self.w13_weight / w2_weight / *_weight_scale parameters. Both
copies stay resident on GPU through the finalize iteration, and on EP=8 the
per-rank weight footprint (~125 GiB) plus this duplication leaves no
headroom for the per-layer interleave temporaries (~1 GiB peak).

The forward path (deepseek_v4.py: _run_mega_moe, ~lines 538-547) was
verified to read only self._transformed_l1_weights / _transformed_l2_weights;
the original w13_weight / w2_weight / *_weight_scale are dead after finalize.

Fix (mirrors upstream PR vllm-project/vllm#40860): at the end of
finalize_weights() of each expert module, drop the four original
Parameters by assigning them to None so they are removed from the module's
_parameters dict. transform_weights_for_mega_moe allocates fresh L1 + SF
tensors and only the L2 weight aliases the original w2_weight storage --
_transformed_l2_weights still holds that reference, so the storage stays
live via refcount. PyTorch's caching allocator can then reuse the freed
storage for the NEXT layer's interleave temporaries within the same
finalize loop.

Reference: vllm/model_executor/models/deepseek_v4.py,
DeepseekV4MegaMoEExperts.finalize_weights().
"""

import sys
from pathlib import Path

TARGET = Path(
    "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/deepseek_v4.py"
)

# Idempotency marker
MARKER = "srt-slurm-sa hotfix: free original MegaMoE expert weights"

# Anchor: closing of the _transformed_l1/l2 assignment in finalize_weights().
# The triple-`)` pattern is unique in the file.
OLD = (
    "        self._transformed_l1_weights, self._transformed_l2_weights = (\n"
    "            deep_gemm.transform_weights_for_mega_moe(\n"
    "                (self.w13_weight.data.view(torch.int8).contiguous(), w13_scale),\n"
    "                (self.w2_weight.data.view(torch.int8).contiguous(), w2_scale),\n"
    "            )\n"
    "        )\n"
)

NEW = (
    OLD
    + "        # srt-slurm-sa hotfix: free original MegaMoE expert weights.\n"
    + "        # Mirrors vllm-project/vllm#40860. transform_weights_for_mega_moe\n"
    + "        # allocates fresh L1 + SF tensors; only the L2 weight aliases the\n"
    + "        # original w2_weight storage, but _transformed_l2_weights holds that\n"
    + "        # reference, so dropping the Parameters is safe via refcount and the\n"
    + "        # freed storage returns to the caching allocator in time for the next\n"
    + "        # layer's interleave temp (~1 GiB).\n"
    + "        self.w13_weight = None\n"
    + "        self.w13_weight_scale = None\n"
    + "        self.w2_weight = None\n"
    + "        self.w2_weight_scale = None\n"
)


def main():
    if not TARGET.exists():
        print(f"[vllm-mega-moe-free-orig] Target not found: {TARGET}", file=sys.stderr)
        sys.exit(1)

    content = TARGET.read_text()

    if MARKER in content:
        print("[vllm-mega-moe-free-orig] Already patched, skipping.", file=sys.stderr)
        return

    count = content.count(OLD)
    if count == 0:
        print(
            "[vllm-mega-moe-free-orig] Could not find finalize_weights anchor. "
            "vLLM version may have drifted; inspect "
            "DeepseekV4MegaMoEExperts.finalize_weights().",
            file=sys.stderr,
        )
        sys.exit(1)
    if count > 1:
        print(
            f"[vllm-mega-moe-free-orig] Anchor is ambiguous ({count} occurrences); "
            "refusing to patch.",
            file=sys.stderr,
        )
        sys.exit(1)

    content = content.replace(OLD, NEW)
    TARGET.write_text(content)
    print(
        "[vllm-mega-moe-free-orig] Freed original w13/w2 weights and scales "
        "in DeepseekV4MegaMoEExperts.finalize_weights().",
        file=sys.stderr,
    )


if __name__ == "__main__":
    main()
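
The mechanism the hotfix relies on can be shown with a toy module; the class name and tensor sizes below are illustrative, not the vLLM classes. Assigning None to a registered Parameter removes it from the module's _parameters dict, and once the last tensor reference is gone the CUDA caching allocator can reuse the storage.

import torch
import torch.nn as nn

class ToyExperts(nn.Module):
    def __init__(self):
        super().__init__()
        self.w13_weight = nn.Parameter(
            torch.empty(4096, 4096), requires_grad=False
        )

    def finalize_weights(self):
        # Build the layout the forward path will actually read; .contiguous()
        # materializes a fresh tensor, so it does not alias the Parameter.
        self._transformed = self.w13_weight.t().contiguous()
        # Drop the original Parameter. nn.Module.__setattr__ routes the None
        # through register_parameter, removing the entry, and with no
        # remaining references the storage is returned to the allocator for
        # the next layer's temporaries.
        self.w13_weight = None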