-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathbenchmark_pipeline.sh
76 lines (70 loc) · 2.57 KB
/
benchmark_pipeline.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
#!/bin/bash
# Positional arguments 1-6: model, dataset and request-load settings.
model_path="${1}"
dataset_path="${2}"
request_rate="${3}"
num_requests="${4}"
pressure_test="${5}"
max_concurrent_requests="${6}"
# Positional arguments 7-20: the parameter combination being benchmarked.
tp_size="${7}"
pp_size="${8}"
max_num_seqs="${9}"
max_num_batched_tokens="${10}"
scheduler_delay_factor="${11}"
block_size="${12}"
# Comma-separated list of ports; one server is started per entry.
port="${13}"
# '#'-separated list; entry i becomes CUDA_VISIBLE_DEVICES for server i.
device_group="${14}"
model="${15}"
enable_chunked_prefill="${16}"
dataset_name="${17}"
enable_prefix_caching="${18}"
disable_custom_all_reduce="${19}"
use_v2_block_manager="${20}"
# Split the comma-separated port list into the ADDR array. The IFS override
# is a command prefix, so it applies to this `read` only and the shell-wide
# IFS is left as it was.
IFS=',' read -r -a ADDR <<< "${port}"
# Split the '#'-separated device groups into the GPU_ADDR array, same technique.
IFS='#' read -r -a GPU_ADDR <<< "${device_group}"
#######################################
# Print the script name, then one line per positional argument in the form
#   <name>=<positional value> <value of the variable with that name>
# Values are printed through quoted printf arguments, so glob characters and
# repeated whitespace appear in the log exactly as they were passed.
# Globals:   the twenty variables named in cfg_names (read)
# Arguments: the script's positional parameters, forwarded as "$@"
# Outputs:   21 lines on stdout
#######################################
print_config() {
  local -a cfg_names=(
    model_path dataset_path request_rate num_requests pressure_test
    max_concurrent_requests tp_size pp_size max_num_seqs
    max_num_batched_tokens scheduler_delay_factor block_size port
    device_group model enable_chunked_prefill dataset_name
    enable_prefix_caching disable_custom_all_reduce use_v2_block_manager
  )
  local cfg_idx cfg_pos cfg_name

  printf '%s\n' 'benchmark_pipeline.sh'
  for cfg_idx in "${!cfg_names[@]}"; do
    cfg_name="${cfg_names[cfg_idx]}"
    # Names are listed in positional order, so entry 0 pairs with $1, etc.
    cfg_pos=$((cfg_idx + 1))
    # ${!cfg_pos} is positional parameter number cfg_pos; ${!cfg_name} is the
    # variable whose name is stored in cfg_name.
    printf '%s=%s %s\n' "${cfg_name}" "${!cfg_pos-}" "${!cfg_name-}"
  done
}
print_config "$@"
#######################################
# Launch one run_server.sh per entry of ADDR, each as a background job.
# Server i receives ADDR[i] as its port argument and runs with
# CUDA_VISIBLE_DEVICES set to GPU_ADDR[i]. Nothing here waits for the
# servers; the function returns as soon as the jobs have been spawned.
# Globals:   ADDR, GPU_ADDR, model_path, tp_size, pp_size, max_num_seqs,
#            max_num_batched_tokens, scheduler_delay_factor, block_size,
#            enable_chunked_prefill, enable_prefix_caching,
#            disable_custom_all_reduce, use_v2_block_manager (all read)
# Arguments: none
# Outputs:   progress lines on stdout; a warning on stderr when GPU_ADDR has
#            fewer entries than ADDR
#######################################
start_servers() {
  local idx device

  echo "server start!"
  if (( ${#GPU_ADDR[@]} < ${#ADDR[@]} )); then
    # Not fatal: the affected servers are still launched, but with an empty
    # CUDA_VISIBLE_DEVICES, so make the mismatch visible.
    printf 'warning: %d port(s) but only %d device group(s)\n' \
      "${#ADDR[@]}" "${#GPU_ADDR[@]}" >&2
  fi
  for (( idx = 0; idx < ${#ADDR[@]}; idx++ )); do
    device="${GPU_ADDR[idx]-}"
    echo "device:${device}"
    echo "addr:${ADDR[idx]}"
    # Every argument is quoted so that an empty value, or one containing
    # spaces, still occupies exactly one positional slot in run_server.sh.
    CUDA_VISIBLE_DEVICES="${device}" bash run_server.sh \
      "${model_path}" \
      "${ADDR[idx]}" \
      "${tp_size}" \
      "${pp_size}" \
      "${max_num_seqs}" \
      "${max_num_batched_tokens}" \
      "${scheduler_delay_factor}" \
      "${block_size}" \
      "${enable_chunked_prefill}" \
      "${enable_prefix_caching}" \
      "${disable_custom_all_reduce}" \
      "${use_v2_block_manager}" &
  done
  echo "finish server start!"
}
start_servers
#######################################
# Run run_client.sh in the foreground with the benchmark settings.
# Globals:   model_path, dataset_path, request_rate, num_requests,
#            pressure_test, max_concurrent_requests, tp_size, pp_size,
#            max_num_seqs, max_num_batched_tokens, scheduler_delay_factor,
#            block_size, port, model, dataset_name, enable_chunked_prefill,
#            enable_prefix_caching, disable_custom_all_reduce,
#            use_v2_block_manager (all read)
# Arguments: none
# Outputs:   "client start!" / "finish client start!" on stdout, around
#            whatever run_client.sh itself prints
#######################################
run_client() {
  echo "client start!"
  # Every argument is quoted so that an empty value, or one containing
  # spaces, still occupies exactly one positional slot in run_client.sh.
  # ${port} is forwarded as the original comma-separated string.
  bash run_client.sh \
    "${model_path}" \
    "${dataset_path}" \
    "${request_rate}" \
    "${num_requests}" \
    "${pressure_test}" \
    "${max_concurrent_requests}" \
    "${tp_size}" \
    "${pp_size}" \
    "${max_num_seqs}" \
    "${max_num_batched_tokens}" \
    "${scheduler_delay_factor}" \
    "${block_size}" \
    "${port}" \
    "${model}" \
    "${dataset_name}" \
    "${enable_chunked_prefill}" \
    "${enable_prefix_caching}" \
    "${disable_custom_all_reduce}" \
    "${use_v2_block_manager}"
  echo "finish client start!"
}
run_client
# Tear down the servers in two passes: first SIGKILL every process whose
# command line matches the vLLM API server entry point, then SIGKILL whatever
# lsof still reports on each benchmark port, so that no engine process is
# left behind.
# `xargs -r` skips running `kill` when the producer prints no PIDs; without
# it, GNU xargs invokes `kill -9` with no operand and emits a usage error.
pgrep -f "vllm.entrypoints.api_server" | xargs -r kill -9
for port_entry in "${ADDR[@]}"; do
  # Evaluate the entry arithmetically so it is passed to lsof as a plain integer.
  int_port=$((port_entry - 0))
  echo "int_port=${int_port}"
  lsof -t -i:"${int_port}" | xargs -r kill -9
done