Commit 96c4803

Merge branch 'main' into feat/upgrade-amd-mi35x-docker-to-0.5.10

2 parents: c7f757e + 68bf34d

14 files changed: +669, -25 lines

.github/configs/amd-master.yaml
Lines changed: 36 additions & 0 deletions

@@ -203,6 +203,24 @@ qwen3.5-fp8-mi355x-sglang:
     search-space:
     - { tp: 8, conc-start: 4, conc-end: 64 }
 
+qwen3.5-fp4-mi355x-sglang:
+  image: lmsysorg/sglang:v0.5.10-rocm720-mi35x
+  model: amd/Qwen3.5-397B-A17B-MXFP4
+  model-prefix: qwen3.5
+  runner: mi355x
+  precision: fp4
+  framework: sglang
+  multinode: false
+  seq-len-configs:
+  - isl: 1024
+    osl: 1024
+    search-space:
+    - { tp: 4, conc-start: 4, conc-end: 256 }
+  - isl: 8192
+    osl: 1024
+    search-space:
+    - { tp: 4, conc-start: 4, conc-end: 256 }
+
 qwen3.5-fp8-mi300x-sglang:
   image: lmsysorg/sglang:v0.5.9-rocm720-mi30x
   model: Qwen/Qwen3.5-397B-A17B-FP8

@@ -239,6 +257,24 @@ glm5-fp8-mi355x-sglang:
     search-space:
     - { tp: 8, conc-start: 4, conc-end: 64 }
 
+glm5-fp8-mi355x-atom:
+  image: rocm/atom:rocm7.2.1-ubuntu24.04-pytorch2.9.1-atom0.1.2.post
+  model: zai-org/GLM-5-FP8
+  model-prefix: glm5
+  runner: mi355x
+  precision: fp8
+  framework: atom
+  multinode: false
+  seq-len-configs:
+  - isl: 1024
+    osl: 1024
+    search-space:
+    - { tp: 8, conc-start: 4, conc-end: 256 }
+  - isl: 8192
+    osl: 1024
+    search-space:
+    - { tp: 8, conc-start: 4, conc-end: 256 }
+
 kimik2.5-int4-mi355x-vllm:
   image: vllm/vllm-openai-rocm:v0.18.0
   model: moonshotai/Kimi-K2.5

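A note on how these entries are consumed: each search-space item defines a concurrency sweep from conc-start to conc-end for one parallelism setting, and the generator script (utils/matrix_logic/generate_sweep_configs.py, invoked by the e2e-tests workflow below) expands every entry into individual benchmark jobs. The sketch below is illustrative only: the doubling concurrency ladder and the job-dict shape are assumptions, not the generator's actual logic.

# Hypothetical sketch of the sweep expansion; the doubling step and job
# shape are assumptions, not the logic of generate_sweep_configs.py.
def expand_entry(prefix, cfg):
    jobs = []
    for space in cfg["search-space"]:
        conc = space["conc-start"]
        while conc <= space["conc-end"]:
            jobs.append({"model-prefix": prefix, "isl": cfg["isl"],
                         "osl": cfg["osl"], "tp": space["tp"], "conc": conc})
            conc *= 2  # assumed sweep step
    return jobs

# The new qwen3.5-fp4-mi355x-sglang entry at tp=4 sweeps conc 4..256:
entry = {"isl": 1024, "osl": 1024,
         "search-space": [{"tp": 4, "conc-start": 4, "conc-end": 256}]}
print([j["conc"] for j in expand_entry("qwen3.5", entry)])
# -> [4, 8, 16, 32, 64, 128, 256]
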
.github/configs/nvidia-master.yaml
Lines changed: 55 additions & 6 deletions

@@ -1789,6 +1789,24 @@ qwen3.5-fp8-b200-sglang:
     - { tp: 8, ep: 1, conc-start: 4, conc-end: 16 }
     - { tp: 4, ep: 4, conc-start: 16, conc-end: 128 }
 
+qwen3.5-fp4-b200-sglang:
+  image: lmsysorg/sglang:nightly-dev-20260402-d7256eb6
+  model: nvidia/Qwen3.5-397B-A17B-NVFP4
+  model-prefix: qwen3.5
+  runner: b200
+  precision: fp4
+  framework: sglang
+  multinode: false
+  seq-len-configs:
+  - isl: 1024
+    osl: 1024
+    search-space:
+    - { tp: 4, ep: 1, conc-start: 4, conc-end: 128 }
+  - isl: 8192
+    osl: 1024
+    search-space:
+    - { tp: 4, ep: 1, conc-start: 4, conc-end: 128 }
+
 glm5-fp8-b200-sglang:
   image: lmsysorg/sglang:nightly-dev-cu13-20260317-1eea7448
   model: zai-org/GLM-5-FP8

@@ -1819,11 +1837,13 @@ glm5-fp4-b200-sglang:
   - isl: 1024
     osl: 1024
     search-space:
-    - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 }
+    - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 }
+    - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 }
   - isl: 8192
     osl: 1024
     search-space:
-    - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 }
+    - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 }
+    - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 }
 
 qwen3.5-fp8-b200-sglang-mtp:
   image: lmsysorg/sglang:v0.5.9-cu130

@@ -3131,14 +3151,43 @@ minimaxm2.5-fp8-b200-vllm:
     osl: 1024
     search-space:
     - { tp: 2, conc-start: 4, conc-end: 512 }
-    - { tp: 2, ep: 2, conc-start: 4, conc-end: 256 }
     - { tp: 4, conc-start: 4, conc-end: 512 }
-    - { tp: 4, ep: 4, conc-start: 16, conc-end: 64 }
+    - { tp: 2, ep: 2, conc-start: 512, conc-end: 512 }
+    - { tp: 4, ep: 4, conc-start: 256, conc-end: 512 }
+  - isl: 8192
+    osl: 1024
+    search-space:
+    - { tp: 2, conc-start: 4, conc-end: 512 }
+    - { tp: 4, conc-start: 4, conc-end: 512 }
+
+minimaxm2.5-fp4-b200-vllm:
+  image: vllm/vllm-openai:v0.19.0-cu130
+  model: nvidia/MiniMax-M2.5-NVFP4
+  model-prefix: minimaxm2.5
+  runner: b200
+  precision: fp4
+  framework: vllm
+  multinode: false
+  seq-len-configs:
+  - isl: 1024
+    osl: 1024
+    search-space:
+    - { tp: 1, conc-start: 4, conc-end: 4 }
+    - { tp: 2, conc-start: 4, conc-end: 512 }
+    - { tp: 2, ep: 2, conc-start: 128, conc-end: 256 }
+    - { tp: 2, ep: 2, dp-attn: true, conc-start: 512, conc-end: 512 }
+    - { tp: 4, conc-start: 4, conc-end: 512 }
+    - { tp: 4, ep: 4, conc-start: 32, conc-end: 128 }
+    - { tp: 8, conc-start: 4, conc-end: 4 }
   - isl: 8192
     osl: 1024
     search-space:
-    - { tp: 2, conc-start: 4, conc-end: 256 }
-    - { tp: 4, conc-start: 4, conc-end: 256 }
+    - { tp: 1, conc-start: 4, conc-end: 32 }
+    - { tp: 1, conc-start: 256, conc-end: 512 }
+    - { tp: 2, conc-start: 4, conc-end: 512 }
+    - { tp: 2, ep: 2, conc-start: 128, conc-end: 512 }
+    - { tp: 4, conc-start: 4, conc-end: 512 }
+    - { tp: 8, conc-start: 4, conc-end: 4 }
 
 gptoss-fp4-h100-vllm:
   image: vllm/vllm-openai:v0.18.0

.github/workflows/e2e-tests.yml
Lines changed: 1 addition & 1 deletion

@@ -54,7 +54,7 @@ jobs:
           pip install pydantic
           CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py \
             ${{ inputs.generate-cli-command || github.event.inputs.generate-cli-command }})
-          SINGLE=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' not in x and not x.get('run-eval', False)]))")
+          SINGLE=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' not in x and not x.get('eval-only', False)]))")
           MULTI=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' in x]))")
           EVALS=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' not in x and x.get('run-eval', False)]))")
           echo "single-node-config=$SINGLE" >> $GITHUB_OUTPUT

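The one-line change above swaps the gating key for the single-node list: a config marked eval-only is now the one excluded from SINGLE, instead of any config marked run-eval. A standalone sketch of the three-way split, with made-up sample configs:

# Sketch of the SINGLE/MULTI/EVALS partition performed in the step above.
# The sample configs are invented; only the filter predicates mirror the diff.
import json

configs = [
    {"model-prefix": "qwen3.5", "run-eval": True},   # throughput + eval
    {"model-prefix": "glm5", "eval-only": True},     # eval job only
    {"model-prefix": "kimik2.5", "prefill": 2048},   # multi-node job
]

single = [x for x in configs if "prefill" not in x and not x.get("eval-only", False)]
multi = [x for x in configs if "prefill" in x]
evals = [x for x in configs if "prefill" not in x and x.get("run-eval", False)]

print(json.dumps({"single": single, "multi": multi, "evals": evals}, indent=2))
# With the old run-eval predicate, the qwen3.5 config would have been
# dropped from "single"; now it appears in both "single" and "evals".
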
benchmarks/single_node/glm5_fp4_b200.sh
Lines changed: 14 additions & 11 deletions

@@ -33,23 +33,26 @@ fi
 # Start GPU monitoring (power, temperature, clocks every second)
 start_gpu_monitor
 
-# following https://huggingface.co/nvidia/GLM-5-NVFP4#usage recipe
-# except using latest nightly at the time of writing
-# since the recommended nightly image in that recipe doesn't exist.
-
 set -x
 PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL --host=0.0.0.0 --port=$PORT \
     --trust-remote-code \
     --tensor-parallel-size=$TP \
-    --data-parallel-size 1 --expert-parallel-size 1 \
-    --tool-call-parser glm47 \
-    --reasoning-parser glm45 \
+    --data-parallel-size 1 --expert-parallel-size $EP_SIZE \
+    --disable-radix-cache \
     --quantization modelopt_fp4 \
-    --cuda-graph-max-bs $CONC --max-running-requests $CONC \
-    --mem-fraction-static 0.80 \
-    --chunked-prefill-size 131072 \
+    --kv-cache-dtype fp8_e4m3 \
+    --nsa-decode-backend trtllm \
+    --nsa-prefill-backend trtllm \
+    --moe-runner-backend flashinfer_trtllm \
+    --enable-flashinfer-allreduce-fusion \
+    --cuda-graph-max-bs 256 \
+    --max-prefill-tokens 32768 \
+    --chunked-prefill-size 32768 \
+    --mem-fraction-static 0.9 \
     --stream-interval 30 \
-    --model-loader-extra-config '{"enable_multithread_load": true}' $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &
+    --scheduler-recv-interval 10 \
+    --tokenizer-worker-num 6 \
+    --tokenizer-path $MODEL $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!
 
Lines changed: 81 additions & 0 deletions

@@ -0,0 +1,81 @@
+#!/usr/bin/env bash
+
+source "$(dirname "$0")/../benchmark_lib.sh"
+
+check_env_vars \
+    MODEL \
+    TP \
+    CONC \
+    ISL \
+    OSL \
+    RANDOM_RANGE_RATIO \
+    RESULT_FILENAME \
+    EP_SIZE \
+    DP_ATTENTION
+
+if [[ -n "$SLURM_JOB_ID" ]]; then
+    echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
+fi
+
+echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION"
+
+SERVER_LOG=/workspace/server.log
+PORT=${PORT:-8888}
+
+export OMP_NUM_THREADS=1
+
+# Calculate max-model-len based on ISL and OSL
+if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then
+    CALCULATED_MAX_MODEL_LEN=""
+else
+    CALCULATED_MAX_MODEL_LEN=" --max-model-len 10240 "
+fi
+
+if [ "$EP_SIZE" -gt 1 ]; then
+    EP=" --enable-expert-parallel"
+else
+    EP=" "
+fi
+
+# Start GPU monitoring (power, temperature, clocks every second)
+start_gpu_monitor
+
+set -x
+pip install -U transformers
+python3 -m atom.entrypoints.openai_server \
+    --model $MODEL \
+    --server-port $PORT \
+    -tp $TP \
+    --kv_cache_dtype fp8 $CALCULATED_MAX_MODEL_LEN $EP \
+    --default-chat-template-kwargs '{"enable_thinking": false}' \
+    --trust-remote-code \
+    > $SERVER_LOG 2>&1 &
+
+SERVER_PID=$!
+
+# Wait for server to be ready
+wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
+
+export PYTHONDONTWRITEBYTECODE=1
+run_benchmark_serving \
+    --model "$MODEL" \
+    --port "$PORT" \
+    --backend vllm \
+    --input-len "$ISL" \
+    --output-len "$OSL" \
+    --random-range-ratio "$RANDOM_RANGE_RATIO" \
+    --num-prompts "$((CONC * 10))" \
+    --max-concurrency "$CONC" \
+    --result-filename "$RESULT_FILENAME" \
+    --result-dir /workspace/ \
+    --trust-remote-code
+
+# After throughput, run evaluation only if RUN_EVAL is true
+if [ "${RUN_EVAL}" = "true" ]; then
+    run_eval --framework lm-eval --port "$PORT"
+    append_lm_eval_summary
+fi
+
+# Stop GPU monitoring
+stop_gpu_monitor
+set +x
Lines changed: 80 additions & 0 deletions

@@ -0,0 +1,80 @@
+#!/usr/bin/env bash
+
+source "$(dirname "$0")/../benchmark_lib.sh"
+
+check_env_vars \
+    MODEL \
+    TP \
+    EP_SIZE \
+    DP_ATTENTION \
+    CONC \
+    ISL \
+    OSL \
+    MAX_MODEL_LEN \
+    RANDOM_RANGE_RATIO \
+    RESULT_FILENAME
+
+if [[ -n "$SLURM_JOB_ID" ]]; then
+    echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
+fi
+
+nvidia-smi
+
+hf download "$MODEL"
+
+SERVER_LOG=/workspace/server.log
+PORT=${PORT:-8888}
+
+if [ "${DP_ATTENTION}" = "true" ]; then
+    PARALLEL_ARGS="--tensor-parallel-size=1 --data-parallel-size=$TP --enable-expert-parallel"
+elif [ "$EP_SIZE" -gt 1 ]; then
+    PARALLEL_ARGS="--tensor-parallel-size=$TP --enable-expert-parallel"
+else
+    PARALLEL_ARGS="--tensor-parallel-size=$TP"
+fi
+
+if [ "${EVAL_ONLY}" = "true" ]; then
+    setup_eval_context
+    MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
+fi
+# Start GPU monitoring (power, temperature, clocks every second)
+start_gpu_monitor
+
+set -x
+vllm serve $MODEL --port $PORT \
+    $PARALLEL_ARGS \
+    --gpu-memory-utilization 0.90 \
+    --max-model-len $MAX_MODEL_LEN \
+    --kv-cache-dtype fp8 \
+    --max-cudagraph-capture-size 2048 \
+    --max-num-batched-tokens "$((ISL * 2 ))" \
+    --stream-interval 20 --no-enable-prefix-caching \
+    --trust-remote-code > $SERVER_LOG 2>&1 &
+
+SERVER_PID=$!
+
+# Wait for server to be ready
+wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
+
+run_benchmark_serving \
+    --model "$MODEL" \
+    --port "$PORT" \
+    --backend vllm \
+    --input-len "$ISL" \
+    --output-len "$OSL" \
+    --random-range-ratio "$RANDOM_RANGE_RATIO" \
+    --num-prompts "$((CONC * 10))" \
+    --max-concurrency "$CONC" \
+    --result-filename "$RESULT_FILENAME" \
+    --result-dir /workspace/ \
+    --trust-remote-code
+
+# After throughput, run evaluation only if RUN_EVAL is true
+if [ "${RUN_EVAL}" = "true" ]; then
+    run_eval --framework lm-eval --port "$PORT"
+    append_lm_eval_summary
+fi
+
+# Stop GPU monitoring
+stop_gpu_monitor
+set +x
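
Both new scripts follow the same pattern: stand up an OpenAI-compatible server (atom.entrypoints.openai_server or vllm serve) on $PORT, wait for readiness, then drive it with run_benchmark_serving. As a quick sanity check outside the harness, a minimal client such as the sketch below can exercise the same endpoint; the model name, port, and prompt are illustrative assumptions.

# Minimal smoke test against the OpenAI-compatible /v1/completions endpoint
# the benchmark scripts launch; model, port, and prompt are assumptions.
import json
import urllib.request

payload = {"model": "nvidia/MiniMax-M2.5-NVFP4",
           "prompt": "Say hello.", "max_tokens": 16}
req = urllib.request.Request(
    "http://localhost:8888/v1/completions",
    data=json.dumps(payload).encode(),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    print(json.load(resp)["choices"][0]["text"])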
