Skip to content

Commit e583439

Browse files
CharlieFRuanRay Userclaude
authored
[train] Patch vLLM v0.16.0 sleep mode to properly free model weights (NovaSky-AI#1365)
vLLM v0.16.0 has a bug in `gpu_worker.py:load_model()` where the two context managers are chained with `and` instead of `,`:

    with pool_ctx and config_ctx:  # BUG: only config_ctx is entered

In Python, `A and B` evaluates A but returns B when A is truthy, so the `with` statement only receives `config_ctx` and `pool_ctx.__enter__()` is never called. This means model weights are never tracked by CuMemAllocator and cannot be freed during sleep.

Impact: In colocated training+inference setups, ~14 GiB of model weights remain in GPU after sleep, severely limiting available memory for training (e.g., reducing max context length).

Fix: Monkey-patch `Worker.load_model` to use `,` (comma), which properly enters both context managers. The patch is only applied for vLLM 0.16.x. The bug is fixed upstream in vllm-project/vllm#32947 (included in v0.17.0+).

Also adds a GPU unit test that verifies sleep frees >70% of loaded model+cache memory. The test fails without the patch and passes with it.

---------

Co-authored-by: Ray User <ray@anyscale.com>
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent d03aab6 commit e583439

2 files changed

Lines changed: 153 additions & 0 deletions

File tree

skyrl/backends/skyrl_train/inference_servers/vllm_worker.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,46 @@
1818

1919
import torch
2020

21+
22+
def _patch_vllm_sleep_mode():
23+
"""Fix vLLM v0.16.0 bug: gpu_worker.load_model() uses `and` instead of `,`
24+
between context managers, causing CuMemAllocator pool to never be entered
25+
for weight allocations. This means model weights are never tracked by the
26+
allocator and cannot be freed during sleep.
27+
28+
In Python, `with A and B:` only enters B (A is evaluated but __enter__ is
29+
never called). The fix is `with A, B:` which enters both.
30+
31+
Fixed upstream in https://github.com/vllm-project/vllm/pull/32947 (v0.17.0+).
32+
This monkey-patch applies only to vLLM 0.16.x.
33+
"""
34+
try:
35+
import vllm
36+
37+
if not vllm.__version__.startswith("0.16"):
38+
return
39+
40+
import os
41+
42+
from vllm.config import set_current_vllm_config
43+
from vllm.v1.worker.gpu_worker import Worker
44+
45+
def _patched_load_model(self) -> None:
46+
eep_scale_up = os.environ.get("VLLM_ELASTIC_EP_SCALE_UP_LAUNCH") == "1"
47+
# Use comma (,) instead of `and` to properly enter BOTH context managers
48+
with (
49+
self._maybe_get_memory_pool_context(tag="weights"),
50+
set_current_vllm_config(self.vllm_config),
51+
):
52+
self.model_runner.load_model(eep_scale_up=eep_scale_up)
53+
54+
Worker.load_model = _patched_load_model
55+
except Exception as e:
56+
warnings.warn(f"Failed to patch vLLM sleep mode: {e}")
57+
58+
59+
# Apply the patch at import time: importing this module is what installs the fix.
_patch_vllm_sleep_mode()
60+
2161
# Path to this worker extension class for use in CLI args (derived from module path)
2262
VLLM_WORKER_EXTENSION_CLS = f"{__name__}.WorkerWrap"
2363

Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
"""
2+
Test that vLLM sleep mode properly frees model weights from GPU memory.
3+
4+
This validates the SkyRL monkey-patch for a vLLM v0.16.0 bug where
5+
gpu_worker.load_model() uses `with A and B:` instead of `with A, B:`,
6+
causing CuMemAllocator to never track model weights.
7+
8+
The test FAILS without the patch (model weights not freed) and
9+
PASSES with it (model weights properly freed via CuMemAllocator).
10+
11+
Fixed upstream: https://github.com/vllm-project/vllm/pull/32947 (v0.17.0+)
12+
Patched in SkyRL: skyrl/backends/skyrl_train/inference_servers/vllm_worker.py
13+
14+
Requires: 1 GPU (any type with >= 4 GiB)
15+
Run: pytest tests/backends/skyrl_train/gpu/test_vllm_sleep_memory.py -v -s
16+
"""
17+
18+
import gc
19+
import os
20+
21+
import pytest
22+
import torch
23+
24+
# Module-wide marker: skip every test in this file when CUDA is unavailable.
pytestmark = pytest.mark.skipif(not torch.cuda.is_available(), reason="Requires GPU")
25+
26+
# Model identifier passed as `model=` when the test builds its vLLM engine.
TEST_MODEL = "Qwen/Qwen2.5-0.5B-Instruct"
27+
28+
29+
def _gpu_used_bytes() -> int:
    """Return GPU memory in use, in bytes, as reported by ``torch.cuda.mem_get_info()``.

    The value is the reported total memory minus the reported free memory.
    """
    free_bytes, total_bytes = torch.cuda.mem_get_info()
    used_bytes = total_bytes - free_bytes
    return used_bytes
32+
33+
34+
def _create_engine_and_measure_sleep():
    """Create a vLLM engine with sleep mode, sleep it, and return memory stats.

    GPU usage is sampled with ``_gpu_used_bytes()`` at three points: before the
    engine is created, after it is initialized, and after
    ``engine.sleep(level=2)``.

    ``VLLM_ENABLE_V1_MULTIPROCESSING`` is set to ``"0"`` for the duration of the
    call and restored to its previous value (or removed) afterwards, so the
    setting does not leak into other tests in the same session. The engine
    reference is dropped and the CUDA cache emptied even if creation or sleep
    raises.

    Returns:
        A 4-tuple of byte counts:
        ``(model_and_cache, cumem_usage_before_sleep, total_freed, residual)``

        * ``model_and_cache``: usage after init minus usage before init.
        * ``cumem_usage_before_sleep``: ``CuMemAllocator.get_current_usage()``
          read just before sleeping.
        * ``total_freed``: usage after init minus usage after sleep.
        * ``residual``: usage after sleep minus usage before init.
    """
    env_key = "VLLM_ENABLE_V1_MULTIPROCESSING"
    previous_env = os.environ.get(env_key)
    # NOTE(review): presumably this keeps the engine in-process so the
    # CuMemAllocator singleton queried below is the one the engine uses —
    # confirm against the vLLM docs.
    os.environ[env_key] = "0"

    engine = None
    try:
        import vllm

        used_before = _gpu_used_bytes()

        engine = vllm.LLM(
            model=TEST_MODEL,
            enforce_eager=True,
            gpu_memory_utilization=0.4,
            enable_sleep_mode=True,
            max_model_len=512,
        )

        used_after_init = _gpu_used_bytes()
        model_and_cache = used_after_init - used_before

        # Ask the allocator how much memory it currently tracks.
        # If weights are tracked, this includes model weights + KV cache.
        # If NOT tracked (bug), this only includes KV cache.
        from vllm.device_allocator.cumem import CuMemAllocator

        allocator = CuMemAllocator.get_instance()
        cumem_usage_before_sleep = allocator.get_current_usage()

        engine.sleep(level=2)
        gc.collect()
        torch.cuda.empty_cache()

        used_after_sleep = _gpu_used_bytes()
        total_freed = used_after_init - used_after_sleep
        residual = used_after_sleep - used_before

        return model_and_cache, cumem_usage_before_sleep, total_freed, residual
    finally:
        # Drop the engine and release cached blocks on every exit path.
        engine = None
        gc.collect()
        torch.cuda.empty_cache()

        # Restore the environment exactly as it was before this call.
        if previous_env is None:
            os.environ.pop(env_key, None)
        else:
            os.environ[env_key] = previous_env
77+
78+
79+
def test_sleep_frees_model_weights():
    """CuMemAllocator must track model weights so sleep can free them.

    Without the patch: CuMemAllocator only tracks KV cache (~few GiB for
    small models), so `cumem_usage` is small and model weights (~1 GiB)
    remain in GPU after sleep.

    With the patch: CuMemAllocator tracks weights + KV cache, so
    `cumem_usage` includes model weights and sleep frees everything.

    We assert that the residual GPU memory after sleep is < 30% of the
    loaded model+cache size. This is GPU-agnostic since it uses ratios.

    The test also fails if engine initialization did not measurably increase
    GPU usage, because the ratios would be meaningless in that case.
    """
    # Import the monkey-patch — this is what we're testing
    import skyrl.backends.skyrl_train.inference_servers.vllm_worker  # noqa: F401

    model_and_cache, cumem_usage, total_freed, residual = _create_engine_and_measure_sleep()

    # Guard against a vacuous pass: with a non-positive baseline the
    # percentages below are undefined, so fail instead of reporting 0%.
    assert model_and_cache > 0, (
        f"Engine initialization did not increase measured GPU usage "
        f"(delta={model_and_cache} bytes); cannot evaluate what sleep freed."
    )

    freed_pct = total_freed / model_and_cache * 100
    residual_pct = residual / model_and_cache * 100

    print(f"\nModel+cache loaded: {model_and_cache / 1024**3:.2f} GiB")
    print(f"CuMemAllocator tracked: {cumem_usage / 1024**3:.2f} GiB")
    print(f"Total freed by sleep: {total_freed / 1024**3:.2f} GiB ({freed_pct:.0f}%)")
    print(f"Residual after sleep: {residual / 1024**3:.2f} GiB ({residual_pct:.0f}%)")

    # Key assertion: residual should be small (< 30% of loaded).
    # Without the fix, residual is ~50%+ (model weights stuck in GPU).
    # With the fix, residual is ~5% (just CUDA context overhead).
    assert residual_pct < 30, (
        f"Sleep left {residual_pct:.0f}% of model+cache in GPU "
        f"(residual={residual / 1024**3:.2f} GiB). "
        f"Model weights are likely NOT tracked by CuMemAllocator. "
        f"Ensure the vLLM sleep patch is applied."
    )

0 commit comments

Comments
 (0)