Skip to content

Commit c7b1fe4

Browse files
authored
[AMD/ROCM] ATOM support for new models: GLM5 (#1009)
* atom glm 5 fp8 on mi355x Signed-off-by: seungrokj <seungrok.jung@amd.com> * atom glm 5 fp8 on mi355x Signed-off-by: seungrokj <seungrok.jung@amd.com> * atom glm 5 fp8 on mi355x Signed-off-by: seungrokj <seungrok.jung@amd.com> * atom glm 5 fp8 on mi355x Signed-off-by: seungrokj <seungrok.jung@amd.com> * atom glm 5 fp8 on mi355x Signed-off-by: seungrokj <seungrok.jung@amd.com> * atom glm 5 fp8 on mi355x Signed-off-by: seungrokj <seungrok.jung@amd.com> * atom glm 5 fp8 on mi355x Signed-off-by: seungrokj <seungrok.jung@amd.com> * atom glm 5 fp8 on mi355x Signed-off-by: seungrokj <seungrok.jung@amd.com> * atom glm 5 fp8 on mi355x Signed-off-by: seungrokj <seungrok.jung@amd.com> * atom glm 5 fp8 on mi355x Signed-off-by: seungrokj <seungrok.jung@amd.com> * atom glm 5 fp8 on mi355x Signed-off-by: seungrokj <seungrok.jung@amd.com> --------- Signed-off-by: seungrokj <seungrok.jung@amd.com>
1 parent 5d037dd commit c7b1fe4

File tree

4 files changed

+112
-1
lines changed

4 files changed

+112
-1
lines changed

.github/configs/amd-master.yaml

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -239,6 +239,24 @@ glm5-fp8-mi355x-sglang:
239239
search-space:
240240
- { tp: 8, conc-start: 4, conc-end: 64 }
241241

# ATOM (AMD ROCm) serving benchmark for GLM-5 FP8 on a single MI355X node.
glm5-fp8-mi355x-atom:
  image: rocm/atom:rocm7.2.1-ubuntu24.04-pytorch2.9.1-atom0.1.2.post
  model: zai-org/GLM-5-FP8
  model-prefix: glm5
  runner: mi355x
  precision: fp8
  framework: atom
  multinode: false
  # Two sweeps — 1k/1k and 8k-prompt/1k-decode — each at tp=8 with
  # concurrency swept from 4 to 256.
  seq-len-configs:
    - isl: 1024
      osl: 1024
      search-space:
        - { tp: 8, conc-start: 4, conc-end: 256 }
    - isl: 8192
      osl: 1024
      search-space:
        - { tp: 8, conc-start: 4, conc-end: 256 }
242260
kimik2.5-int4-mi355x-vllm:
243261
image: vllm/vllm-openai-rocm:v0.18.0
244262
model: moonshotai/Kimi-K2.5
Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
#!/usr/bin/env bash
# Benchmark harness: launches an ATOM OpenAI-compatible server for $MODEL,
# runs the serving throughput benchmark against it, and optionally runs
# lm-eval afterwards.
#
# Required env vars (validated by check_env_vars): MODEL, TP, CONC, ISL,
# OSL, RANDOM_RANGE_RATIO, RESULT_FILENAME, EP_SIZE, DP_ATTENTION.
# Optional: PORT (default 8888), RUN_EVAL ("true" enables lm-eval).
#
# NOTE(review): DP_ATTENTION is validated but never consumed below —
# confirm whether ATOM needs a corresponding server flag.

source "$(dirname "$0")/../benchmark_lib.sh"

check_env_vars \
  MODEL \
  TP \
  CONC \
  ISL \
  OSL \
  RANDOM_RANGE_RATIO \
  RESULT_FILENAME \
  EP_SIZE \
  DP_ATTENTION

# Identify the Slurm allocation when running under a scheduler.
if [[ -n "${SLURM_JOB_ID:-}" ]]; then
  echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
fi

echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION"

SERVER_LOG=/workspace/server.log
PORT=${PORT:-8888}

export OMP_NUM_THREADS=1

# Optional server flags collected in an array so empty or multi-word
# options expand safely (no reliance on unquoted word splitting).
SERVER_EXTRA_ARGS=()

# Only the long-prompt sweep caps max-model-len; the 1024/1024 sweep
# uses the model's default context length.
if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then
  : # keep the model default
else
  SERVER_EXTRA_ARGS+=(--max-model-len 10240)
fi

if [ "$EP_SIZE" -gt 1 ]; then
  SERVER_EXTRA_ARGS+=(--enable-expert-parallel)
fi

# Start GPU monitoring (power, temperature, clocks every second)
start_gpu_monitor

set -x
pip install -U transformers
python3 -m atom.entrypoints.openai_server \
  --model "$MODEL" \
  --server-port "$PORT" \
  -tp "$TP" \
  --kv_cache_dtype fp8 \
  "${SERVER_EXTRA_ARGS[@]}" \
  --default-chat-template-kwargs '{"enable_thinking": false}' \
  --trust-remote-code \
  > "$SERVER_LOG" 2>&1 &

SERVER_PID=$!

# Wait for the server to come up before driving load.
wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"

export PYTHONDONTWRITEBYTECODE=1
run_benchmark_serving \
  --model "$MODEL" \
  --port "$PORT" \
  --backend vllm \
  --input-len "$ISL" \
  --output-len "$OSL" \
  --random-range-ratio "$RANDOM_RANGE_RATIO" \
  --num-prompts "$((CONC * 10))" \
  --max-concurrency "$CONC" \
  --result-filename "$RESULT_FILENAME" \
  --result-dir /workspace/ \
  --trust-remote-code

# After throughput, run evaluation only if RUN_EVAL is true
if [ "${RUN_EVAL:-}" = "true" ]; then
  run_eval --framework lm-eval --port "$PORT"
  append_lm_eval_summary
fi

# Stop GPU monitoring
stop_gpu_monitor
set +x

perf-changelog.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1266,6 +1266,12 @@
12661266
- "Add Qwen3.5-397B-A17B-FP8 H200 SGLang MTP (EAGLE speculative decoding)"
12671267
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1001
12681268

# PR #1009: GLM-5 FP8 single-node ATOM benchmark configs for MI355X.
- config-keys:
    - glm5-fp8-mi355x-atom
  description:
    - "GLM5 FP8 configs added for MI355X ATOM"
  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1009
12691275
- config-keys:
12701276
- kimik2.5-fp4-gb200-dynamo-vllm
12711277
description:

runners/launch_mi355x-amds.sh

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -181,10 +181,16 @@ else
181181

182182
export VLLM_CACHE_ROOT="/it-share/gharunners/.cache/vllm"
183183

# ATOM images must not bind-mount the runner's home directory; every
# other framework keeps the default --container-mount-home behavior.
# NOTE(review): "SLRUM" is a misspelling of "SLURM", preserved because
# the srun invocation below expands $SLRUM_HOME_MOUNT under this name.
case "$FRAMEWORK" in
  atom) SLRUM_HOME_MOUNT="" ;;
  *)    SLRUM_HOME_MOUNT=" --container-mount-home " ;;
esac
189+
184190
srun --jobid=$JOB_ID \
185191
--container-image=$SQUASH_FILE \
186192
--container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
187-
--container-mount-home \
193+
$SLRUM_HOME_MOUNT \
188194
--container-writable \
189195
--container-workdir=/workspace/ \
190196
--no-container-entrypoint --export=ALL \

0 commit comments

Comments
 (0)