66
77from sglang .srt .server_args import ServerArgs
88
9- # Note (Ratish, Chenyang):
10-
11- # SGLang's VLM auto-sizing applies a dynamic 0.95 * factor reserve
12- # (roughly [0.8, 1.05]); Qwen3-Omni nests vision/audio configs under
13- # `thinker_config` so SGLang's VLM path never triggers for us. 0.05
14- # is a conservative linear lower-bound of that dynamic reserve; we
15- # subtract it after auto-sizing when the thinker GPU also hosts encoder
16- # stages. User-pinned mem_fraction_static bypasses this reserve.
17-
18- OMNI_ENCODER_MEM_FRACTION_STATIC_RESERVE = 0.05
19-
209
2110def build_sglang_server_args (
2211 model_path : str ,
@@ -26,10 +15,9 @@ def build_sglang_server_args(
2615 max_prefill_tokens : int = 4096 ,
2716 max_running_requests : int = 16 ,
2817 mem_fraction_static : float | None = None ,
29- auto_mem_fraction_static_reserve : float | None = None ,
3018 ** overrides : Any ,
3119) -> ServerArgs :
32- """Build ServerArgs with shared defaults for all SGLang AR engines."""
20+ """Build a SGLang ServerArgs with shared defaults for AR engines."""
3321 kwargs : dict [str , Any ] = {
3422 "model_path" : model_path ,
3523 "trust_remote_code" : True ,
@@ -45,30 +33,36 @@ def build_sglang_server_args(
4533 if mem_fraction_static is not None :
4634 kwargs ["mem_fraction_static" ] = mem_fraction_static
4735 kwargs .update (overrides )
48- server_args = ServerArgs (** kwargs )
49- _apply_auto_mem_fraction_static_reserve (
50- server_args ,
51- enabled = auto_mem_fraction_static_reserve is not None ,
52- user_mem_fraction_static = mem_fraction_static ,
53- reserve = auto_mem_fraction_static_reserve or 0.0 ,
54- )
55- return server_args
36+ return ServerArgs (** kwargs )
5637
5738
def apply_encoder_mem_reserve(
    server_args: ServerArgs,
    encoder_mem_reserve: float,
) -> None:
    """Subtract encoder_mem_reserve from SGLang's auto-picked mem_fraction_static.

    Note (Chenyang):
    Call this only when SGLang auto-selected mem_fraction_static, i.e. the
    caller did NOT pin --mem-fraction-static. A pinned value is the whole
    budget and no reserve should be taken from it; this function has no way
    to tell a pinned value from an auto-selected one, so skipping the call
    in that case is the caller's responsibility.

    The call is a no-op when ``encoder_mem_reserve`` is not positive or when
    ``server_args.mem_fraction_static`` is None. Otherwise ``server_args`` is
    mutated in place with the reduced value, rounded to 3 decimals.

    Args:
        server_args: Object whose ``mem_fraction_static`` attribute is read
            and overwritten.
        encoder_mem_reserve: Fraction to subtract from the current value.

    Raises:
        ValueError: When the result would drop below 0.1. Below that,
            SGLang's KV allocator fails deep in the scheduler with a
            confusing traceback (empirically crashes ~0.08 on H200 for
            Qwen3-Omni-30B), so surface it at build time instead.
    """
    if encoder_mem_reserve <= 0:
        return
    current = server_args.mem_fraction_static
    if current is None:
        return
    # Round BEFORE the floor check so we validate the value that is actually
    # stored. Binary floats make e.g. 0.15 - 0.05 == 0.09999999999999999,
    # which would otherwise be rejected while printing as "0.100".
    new_value = round(current - encoder_mem_reserve, 3)
    if new_value < 0.1:
        raise ValueError(
            f"auto mem_fraction_static {current:.3f} minus encoder_mem_reserve "
            f"{encoder_mem_reserve:.3f} = {new_value:.3f} is below the safe "
            f"floor 0.1; lower encoder_mem_reserve or pin "
            f"--mem-fraction-static explicitly."
        )
    server_args.mem_fraction_static = new_value
0 commit comments