Skip to content

Commit 4c9ac85

Browse files
authored
[NPU] fix command in npu best practice (sgl-project#16576)
1 parent fb5b71d commit 4c9ac85

1 file changed

Lines changed: 13 additions & 15 deletions

File tree

docs/platforms/ascend_npu_best_practice.md

Lines changed: 13 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -280,16 +280,16 @@ do
280280
export SGLANG_ENABLE_OVERLAP_PLAN_STREAM=1
281281
export SGLANG_ENABLE_SPEC_V2=1
282282
export HCCL_BUFFSIZE=650
283-
export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=12
283+
export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=4
284284
export TASK_QUEUE_ENABLE=0
285285
export SGLANG_SCHEDULER_SKIP_ALL_GATHER=1
286286
export HCCL_SOCKET_IFNAME=xxx
287287
export GLOO_SOCKET_IFNAME=xxx
288288
python -m sglang.launch_server --model-path ${MODEL_PATH} --disaggregation-mode decode --host ${D_IP[$i]} \
289-
--port 8001 --trust-remote-code --dist-init-addr DIP1:5000 --nnodes 2 --node-rank $i --tp-size 32 --dp-size 16 \
289+
--port 8001 --trust-remote-code --dist-init-addr DIP1:5000 --nnodes 2 --node-rank $i --tp-size 32 --dp-size 8 \
290290
--mem-fraction-static 0.75 --max-running-requests 32 --attention-backend ascend --device npu --quantization modelslim \
291291
--moe-a2a-backend deepep --enable-dp-attention --deepep-mode low_latency --enable-dp-lm-head --moe-dense-tp 1 \
292-
--cuda-graph-bs 4 --disaggregation-transfer-backend ascend --watchdog-timeout 9000 --context-length 8192 \
292+
--cuda-graph-bs 2 4 6 --disaggregation-transfer-backend ascend --watchdog-timeout 9000 --context-length 8192 \
293293
--speculative-algorithm NEXTN --speculative-num-steps 3 --speculative-eagle-topk 1 --speculative-num-draft-tokens 4 \
294294
--tokenizer-worker-num 4 --prefill-round-robin-balance --disable-shared-experts-fusion --dtype bfloat16 \
295295
--load-balance-method decode_round_robin
@@ -1769,7 +1769,7 @@ LOCAL_HOST2=`hostname -I|awk -F " " '{print$2}'`
17691769
echo "${LOCAL_HOST1}"
17701770
echo "${LOCAL_HOST2}"
17711771

1772-
export HCCL_BUFFSIZE=1600
1772+
export HCCL_BUFFSIZE=2100
17731773
export HCCL_SOCKET_IFNAME=xxx
17741774
export GLOO_SOCKET_IFNAME=xxx
17751775
export HCCL_OP_EXPANSION_MODE="AIV"
@@ -1867,8 +1867,6 @@ export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
18671867

18681868
export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=16
18691869

1870-
export DEEP_NORMAL_MODE_USE_INT8_QUANT=1
1871-
18721870
MODEL_PATH=xxx
18731871

18741872
export SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=600
@@ -1895,16 +1893,16 @@ do
18951893
export SGLANG_ENABLE_SPEC_V2=1
18961894
export SGLANG_SCHEDULER_DECREASE_PREFILL_IDLE=1
18971895

1898-
python -m sglang.launch_server --model-path ${MODEL_PATH} --disaggregation-mode decode \
1896+
python -m sglang.launch_server --model-path ${MODEL_PATH} \
18991897
--host 127.0.0.1 --port 7439 --trust-remote-code \
19001898
--nnodes 2 --node-rank $i --tp-size 32 --dp-size 32 --mem-fraction-static 0.8 --max-running-requests 768 \
19011899
--attention-backend ascend --device npu --quantization modelslim --enable-dp-attention \
1902-
--moe-a2a-backend ascend_fuseep --cuda-graph-bs 6 8 10 12 18 24 \
1900+
--moe-a2a-backend deepep --deepep-mode auto --cuda-graph-bs 6 8 10 12 18 24 \
19031901
--dist-init-addr 141.61.105.131:5000 --chunked-prefill-size 32768 --max-prefill-tokens 458880 \
19041902
--speculative-algorithm EAGLE3 --speculative-draft-model-path xxx \
19051903
--speculative-num-steps 3 --speculative-eagle-topk 1 --speculative-num-draft-tokens 4 \
1906-
--disaggregation-transfer-backend ascend --watchdog-timeout 9000 --context-length 8192 \
1907-
--prefill-round-robin-balance --enable-dp-lm-head --dtype bfloat16 --tokenizer-worker-num 4
1904+
--watchdog-timeout 9000 --context-length 8192 \
1905+
--enable-dp-lm-head --dtype bfloat16
19081906
NODE_RANK=$i
19091907
break
19101908
fi
@@ -2230,11 +2228,10 @@ python -m sglang.launch_server --model-path $MODEL_PATH \
22302228
--attention-backend ascend --device npu \
22312229
--max-running-requests 32 \
22322230
--disable-radix-cache \
2233-
--base-gpu-id 4 \
22342231
--speculative-algorithm EAGLE3 --speculative-draft-model-path xxx \
22352232
--speculative-num-steps 4 --speculative-eagle-topk 1 --speculative-num-draft-tokens 5 \
22362233
--chunked-prefill-size -1 --max-prefill-tokens 65536 \
2237-
--tp-size 8 --mem-fraction-static 0.72 --cuda-graph-bs 1 4 6 12 18 24 30 32 --dtype bfloat1
2234+
--tp-size 8 --mem-fraction-static 0.72 --cuda-graph-bs 1 4 6 12 18 24 30 32 --dtype bfloat16
22382235

22392236
```
22402237

@@ -2443,10 +2440,10 @@ python -m sglang.launch_server --model-path $MODEL_PATH \
24432440
--attention-backend ascend --device npu --quantization modelslim \
24442441
--max-running-requests 78 \
24452442
--disable-radix-cache --speculative-draft-model-quantization unquant \
2446-
--chunked-prefill-size -1 --max-prefill-tokens 65536 \
2443+
--chunked-prefill-size -1 --max-prefill-tokens 49152 \
24472444
--speculative-algorithm EAGLE3 --speculative-draft-model-path xxx \
24482445
--speculative-num-steps 3 --speculative-eagle-topk 1 --speculative-num-draft-tokens 4 \
2449-
--tp-size 4 --mem-fraction-static 0.72 --cuda-graph-bs 16 32 64 68 72 78 --dtype bfloat16
2446+
--tp-size 4 --mem-fraction-static 0.7 --cuda-graph-bs 16 32 64 68 72 78 --dtype bfloat16
24502447
```
24512448

24522449
#### Benchmark
@@ -3046,7 +3043,7 @@ python -m sglang.launch_server --model-path $MODEL_PATH \
30463043
--max-running-requests 80 --context-length 8192 --dtype bfloat16 \
30473044
--chunked-prefill-size 28672 --max-prefill-tokens 458880 \
30483045
--disable-radix-cache --moe-a2a-backend deepep --deepep-mode auto --enable-dp-attention --enable-dp-lm-head \
3049-
--tp 16 --dp-size 4 --mem-fraction-static 0.7 --cuda-graph-bs 16 20
3046+
--tp 16 --dp-size 4 --mem-fraction-static 0.7 --cuda-graph-bs 16 20 24
30503047
```
30513048

30523049
#### Benchmark
@@ -3136,6 +3133,7 @@ python -m sglang.launch_server \
31363133
--host 127.0.0.1 \
31373134
--port 6699 \
31383135
--tp-size 4 \
3136+
--device npu \
31393137
--attention-backend ascend \
31403138
--mem-fraction-static 0.685 \
31413139
--max-running-requests 80 \

0 commit comments

Comments
 (0)