280280 export SGLANG_ENABLE_OVERLAP_PLAN_STREAM=1
281281 export SGLANG_ENABLE_SPEC_V2=1
282282 export HCCL_BUFFSIZE=650
283- export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=12
283+ export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=4
284284 export TASK_QUEUE_ENABLE=0
285285 export SGLANG_SCHEDULER_SKIP_ALL_GATHER=1
286286 export HCCL_SOCKET_IFNAME=xxx
287287 export GLOO_SOCKET_IFNAME=xxx
288288 python -m sglang.launch_server --model-path ${MODEL_PATH} --disaggregation-mode decode --host ${D_IP[$i]} \
289- --port 8001 --trust-remote-code --dist-init-addr DIP1:5000 --nnodes 2 --node-rank $i --tp-size 32 --dp-size 16 \
289+ --port 8001 --trust-remote-code --dist-init-addr DIP1:5000 --nnodes 2 --node-rank $i --tp-size 32 --dp-size 8 \
290290 --mem-fraction-static 0.75 --max-running-requests 32 --attention-backend ascend --device npu --quantization modelslim \
291291 --moe-a2a-backend deepep --enable-dp-attention --deepep-mode low_latency --enable-dp-lm-head --moe-dense-tp 1 \
292- --cuda-graph-bs 4 --disaggregation-transfer-backend ascend --watchdog-timeout 9000 --context-length 8192 \
292+ --cuda-graph-bs 2 4 6 --disaggregation-transfer-backend ascend --watchdog-timeout 9000 --context-length 8192 \
293293 --speculative-algorithm NEXTN --speculative-num-steps 3 --speculative-eagle-topk 1 --speculative-num-draft-tokens 4 \
294294 --tokenizer-worker-num 4 --prefill-round-robin-balance --disable-shared-experts-fusion --dtype bfloat16 \
295295 --load-balance-method decode_round_robin
@@ -1769,7 +1769,7 @@ LOCAL_HOST2=`hostname -I|awk -F " " '{print$2}'`
17691769echo " ${LOCAL_HOST1} "
17701770echo " ${LOCAL_HOST2} "
17711771
1772- export HCCL_BUFFSIZE=1600
1772+ export HCCL_BUFFSIZE=2100
17731773export HCCL_SOCKET_IFNAME=xxx
17741774export GLOO_SOCKET_IFNAME=xxx
17751775export HCCL_OP_EXPANSION_MODE="AIV"
@@ -1867,8 +1867,6 @@ export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
18671867
18681868export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=16
18691869
1870- export DEEP_NORMAL_MODE_USE_INT8_QUANT=1
1871-
18721870MODEL_PATH=xxx
18731871
18741872export SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=600
@@ -1895,16 +1893,16 @@ do
18951893 export SGLANG_ENABLE_SPEC_V2=1
18961894 export SGLANG_SCHEDULER_DECREASE_PREFILL_IDLE=1
18971895
1898- python -m sglang.launch_server --model-path ${MODEL_PATH} --disaggregation-mode decode \
1896+ python -m sglang.launch_server --model-path ${MODEL_PATH} \
18991897 --host 127.0.0.1 --port 7439 --trust-remote-code \
19001898 --nnodes 2 --node-rank $i --tp-size 32 --dp-size 32 --mem-fraction-static 0.8 --max-running-requests 768 \
19011899 --attention-backend ascend --device npu --quantization modelslim --enable-dp-attention \
1902- --moe-a2a-backend ascend_fuseep --cuda-graph-bs 6 8 10 12 18 24 \
1900+ --moe-a2a-backend deepep --deepep-mode auto --cuda-graph-bs 6 8 10 12 18 24 \
19031901 --dist-init-addr 141.61.105.131:5000 --chunked-prefill-size 32768 --max-prefill-tokens 458880 \
19041902 --speculative-algorithm EAGLE3 --speculative-draft-model-path xxx \
19051903 --speculative-num-steps 3 --speculative-eagle-topk 1 --speculative-num-draft-tokens 4 \
1906- --disaggregation-transfer-backend ascend -- watchdog-timeout 9000 --context-length 8192 \
1907- --prefill-round-robin-balance -- enable-dp-lm-head --dtype bfloat16 --tokenizer-worker-num 4
1904+ --watchdog-timeout 9000 --context-length 8192 \
1905+ --enable-dp-lm-head --dtype bfloat16
19081906 NODE_RANK=$i
19091907 break
19101908 fi
@@ -2230,11 +2228,10 @@ python -m sglang.launch_server --model-path $MODEL_PATH \
22302228 --attention-backend ascend --device npu \
22312229 --max-running-requests 32 \
22322230 --disable-radix-cache \
2233- --base-gpu-id 4 \
22342231 --speculative-algorithm EAGLE3 --speculative-draft-model-path xxx \
22352232 --speculative-num-steps 4 --speculative-eagle-topk 1 --speculative-num-draft-tokens 5 \
22362233 --chunked-prefill-size -1 --max-prefill-tokens 65536 \
2237- --tp-size 8 --mem-fraction-static 0.72 --cuda-graph-bs 1 4 6 12 18 24 30 32 --dtype bfloat1
2234+ --tp-size 8 --mem-fraction-static 0.72 --cuda-graph-bs 1 4 6 12 18 24 30 32 --dtype bfloat16
22382235
22392236```
22402237
@@ -2443,10 +2440,10 @@ python -m sglang.launch_server --model-path $MODEL_PATH \
24432440 --attention-backend ascend --device npu --quantization modelslim \
24442441 --max-running-requests 78 \
24452442 --disable-radix-cache --speculative-draft-model-quantization unquant \
2446- --chunked-prefill-size -1 --max-prefill-tokens 65536 \
2443+ --chunked-prefill-size -1 --max-prefill-tokens 49152 \
24472444 --speculative-algorithm EAGLE3 --speculative-draft-model-path xxx \
24482445 --speculative-num-steps 3 --speculative-eagle-topk 1 --speculative-num-draft-tokens 4 \
2449- --tp-size 4 --mem-fraction-static 0.72 --cuda-graph-bs 16 32 64 68 72 78 --dtype bfloat16
2446+ --tp-size 4 --mem-fraction-static 0.7 --cuda-graph-bs 16 32 64 68 72 78 --dtype bfloat16
24502447```
24512448
24522449#### Benchmark
@@ -3046,7 +3043,7 @@ python -m sglang.launch_server --model-path $MODEL_PATH \
30463043--max-running-requests 80 --context-length 8192 --dtype bfloat16 \
30473044--chunked-prefill-size 28672 --max-prefill-tokens 458880 \
30483045--disable-radix-cache --moe-a2a-backend deepep --deepep-mode auto --enable-dp-attention --enable-dp-lm-head \
3049- --tp 16 --dp-size 4 --mem-fraction-static 0.7 --cuda-graph-bs 16 20
3046+ --tp 16 --dp-size 4 --mem-fraction-static 0.7 --cuda-graph-bs 16 20 24
30503047```
30513048
30523049#### Benchmark
@@ -3136,6 +3133,7 @@ python -m sglang.launch_server \
31363133 --host 127.0.0.1 \
31373134 --port 6699 \
31383135 --tp-size 4 \
3136+ --device npu \
31393137 --attention-backend ascend \
31403138 --mem-fraction-static 0.685 \
31413139 --max-running-requests 80 \
0 commit comments