-
Notifications
You must be signed in to change notification settings - Fork 141
Expand file tree
/
Copy pathqwen3.5_bf16_mi355x.sh
More file actions
executable file
·76 lines (64 loc) · 1.85 KB
/
qwen3.5_bf16_mi355x.sh
File metadata and controls
executable file
·76 lines (64 loc) · 1.85 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
#!/usr/bin/env bash
# Launch an SGLang server for Qwen3.5 (bf16) on MI355X, run a serving
# throughput benchmark against it, and optionally run an lm-eval pass.
#
# Required env vars (validated by check_env_vars from benchmark_lib.sh):
#   MODEL, TP, CONC, ISL, OSL, RANDOM_RANGE_RATIO, RESULT_FILENAME, EP_SIZE
# Optional env vars:
#   PORT (default 8888), EVAL_ONLY, RUN_EVAL, SLURM_JOB_ID, SLURMD_NODENAME

source "$(dirname "$0")/../benchmark_lib.sh"

check_env_vars \
  MODEL \
  TP \
  CONC \
  ISL \
  OSL \
  RANDOM_RANGE_RATIO \
  RESULT_FILENAME \
  EP_SIZE

# Log scheduler placement when running under SLURM.
# ${VAR:-} keeps the test safe if the sourced lib enables `set -u`.
if [[ -n "${SLURM_JOB_ID:-}" ]]; then
  echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-}"
fi

# Pre-fetch model weights so server startup does not block on the download.
hf download "$MODEL"

SERVER_LOG=/workspace/server.log
PORT=${PORT:-8888}
# Small headroom (+20) over input+output lengths for special/system tokens.
CONTEXT_LENGTH=$((ISL + OSL + 20))
MAX_PREFILL_TOKENS=32768

# Build the context-length flags as an array so they expand without
# word-splitting surprises (the original used an unquoted string).
if [[ "${EVAL_ONLY:-}" == "true" ]]; then
  setup_eval_context
  EVAL_CONTEXT_ARGS=(--context-length "$EVAL_MAX_MODEL_LEN")
else
  EVAL_CONTEXT_ARGS=(--context-length "$CONTEXT_LENGTH")
fi

# Start GPU monitoring (power, temperature, clocks every second)
start_gpu_monitor

python3 -m sglang.launch_server \
  --attention-backend triton \
  --model-path "$MODEL" \
  --host 0.0.0.0 \
  --port "$PORT" \
  --tensor-parallel-size "$TP" \
  --ep-size "$EP_SIZE" \
  --trust-remote-code \
  --tokenizer-worker-num 6 \
  --enable-aiter-allreduce-fusion \
  --cuda-graph-max-bs "$CONC" \
  --disable-radix-cache \
  --max-prefill-tokens "$MAX_PREFILL_TOKENS" \
  --scheduler-recv-interval 30 \
  --mem-fraction-static 0.8 \
  "${EVAL_CONTEXT_ARGS[@]}" > "$SERVER_LOG" 2>&1 &
SERVER_PID=$!

# Block until the server answers (or its process dies; the helper watches both).
wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"

# 10 prompts per concurrency slot keeps the measurement window saturated.
run_benchmark_serving \
  --model "$MODEL" \
  --port "$PORT" \
  --backend vllm \
  --input-len "$ISL" \
  --output-len "$OSL" \
  --random-range-ratio "$RANDOM_RANGE_RATIO" \
  --num-prompts "$((CONC * 10))" \
  --max-concurrency "$CONC" \
  --result-filename "$RESULT_FILENAME" \
  --result-dir /workspace/

# After throughput, run accuracy evaluation only if RUN_EVAL is true.
if [[ "${RUN_EVAL:-}" == "true" ]]; then
  run_eval --framework lm-eval --port "$PORT"
  append_lm_eval_summary
fi

# Stop GPU monitoring
stop_gpu_monitor
# Turn off tracing (presumably enabled by benchmark_lib.sh — TODO confirm).
set +x