@@ -166,7 +166,7 @@ Qwen3.5-397B-A17B-FP8:
166166 mtp_flags : " "
167167 dp_flags : " --moe-a2a-backend mori --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
168168 prefill :
169- mem_fraction_static : 0.65
169+ mem_fraction_static : 0.6
170170 disable_radix_cache : true
171171 dp :
172172 max_running_requests : 24
@@ -177,7 +177,7 @@ Qwen3.5-397B-A17B-FP8:
177177 chunked_prefill_size : 65536
178178 cuda_graph_bs_range : " 1-8"
179179 decode :
180- mem_fraction_static : 0.65
180+ mem_fraction_static : 0.5
181181 prefill_round_robin_balance : true
182182 dp :
183183 max_running_requests : 4096
@@ -188,9 +188,9 @@ Qwen3.5-397B-A17B-FP8:
188188 chunked_prefill_size : 262144
189189 cuda_graph_bs_range : " 1-256"
190190 no_dp :
191- max_running_requests : 8
192- chunked_prefill_size : 65536
193- cuda_graph_bs_range : " 1-8 "
191+ max_running_requests : 4
192+ chunked_prefill_size : 32768
193+ cuda_graph_bs_range : " 1-4 "
194194
195195GLM-5-FP8 :
196196 base_flags : " --decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --disaggregation-transfer-backend mori --tool-call-parser glm47 --reasoning-parser glm45 --model-loader-extra-config '{\\\" enable_multithread_load\\\" : true, \\\" num_threads\\\" : 8}' --disable-cuda-graph"
@@ -208,7 +208,7 @@ GLM-5-FP8:
208208 chunked_prefill_size : 65536
209209 cuda_graph_bs_range : " 1-8"
210210 decode :
211- mem_fraction_static : 0.65
211+ mem_fraction_static : 0.5
212212 prefill_round_robin_balance : true
213213 dp :
214214 max_running_requests : 4096
@@ -219,8 +219,8 @@ GLM-5-FP8:
219219 chunked_prefill_size : 262144
220220 cuda_graph_bs_range : " 1-256"
221221 no_dp :
222- max_running_requests : 8
223- chunked_prefill_size : 65536
222+ max_running_requests : 4
223+ chunked_prefill_size : 32768
224224 cuda_graph_bs_range : " 1-8"
225225
226226DeepSeek-R1-0528-MXFP4-Preview :
0 commit comments