-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathrun_client.sh
75 lines (71 loc) · 2.59 KB
/
run_client.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
# Positional arguments of run_client.sh. The number in each assignment is the
# argument's position on the command line; the trailing comment names the
# option of `python -m clients.benchmark_serving` (invoked further down) that
# receives the value.

# Model and dataset selection.
model_path="${1}"                   # --tokenizer
dataset_path="${2}"                 # --dataset-path
model="${14}"                       # --model
dataset_name="${15}"                # --dataset-name

# Request load.
request_rate="${3}"                 # --request-rate
num_requests="${4}"                 # --num-prompts
pressure_test="${5}"                # "True" adds the pressure-test options
max_concurrent_requests="${6}"      # --max-concurrent-requests (pressure test only)

# Parallelism.
tp_size="${7}"                      # --tensor-parallel-size
pp_size="${8}"                      # --pipeline-parallel-size

# Scheduler and block settings.
max_num_seqs="${9}"                 # --max-num-seqs
max_num_batched_tokens="${10}"      # --max-num-batched-tokens
scheduler_delay_factor="${11}"      # --scheduler-delay-factor
block_size="${12}"                  # --block-size

# Endpoint.
port="${13}"                        # --port

# Feature switches, forwarded verbatim as option values.
enable_chunked_prefill="${16}"      # --enable-chunked-prefill
enable_prefix_caching="${17}"       # --enable-prefix-caching
disable_custom_all_reduce="${18}"   # --disable-custom-all-reduce
use_v2_block_manager="${19}"        # --use-v2-block-manager
# Extra client options that are only appended in pressure-test mode.
# Always start from an empty value: without this, a variable named
# additional_options inherited from the caller's environment would be appended
# to the benchmark command whenever pressure_test is not "True".
additional_options=""
# `=` is the string-equality operator POSIX defines for `[`; `==` is an
# extension that not every /bin/sh accepts.
if [ "${pressure_test}" = "True" ]; then
  additional_options="--pressure-test --max-concurrent-requests ${max_concurrent_requests}"
fi
echo run_client.sh

#######################################
# Print and then run the benchmark client command.
#
# The argument vector is built exactly once, so the echoed line and the
# executed command cannot drift apart. Every value is passed as its own
# quoted word: a value containing whitespace or glob characters stays a
# single argument, and no option can fuse with the next one across a line
# continuation.
#
# Globals (read): model_path, dataset_name, dataset_path, request_rate,
#   model, num_requests, max_num_batched_tokens, max_num_seqs,
#   scheduler_delay_factor, enable_chunked_prefill, tp_size, pp_size,
#   block_size, port, enable_prefix_caching, disable_custom_all_reduce,
#   use_v2_block_manager, additional_options
# Arguments: none
# Outputs:   the command line on stdout, followed by the client's own output
# Returns:   the exit status of the python command
#######################################
run_benchmark_client() {
  # `set --` inside a function replaces only this function's positional
  # parameters, which gives a portable argument list without bash arrays and
  # leaves the script's own "$@" untouched.
  set -- python -m clients.benchmark_serving \
    --backend vllm \
    --tokenizer "${model_path}" \
    --dataset-name "${dataset_name}" \
    --dataset-path "${dataset_path}" \
    --request-rate "${request_rate}" \
    --model "${model}" \
    --num-prompts "${num_requests}" \
    --save-result \
    --max-num-batched-tokens "${max_num_batched_tokens}" \
    --max-num-seqs "${max_num_seqs}" \
    --scheduler-delay-factor "${scheduler_delay_factor}" \
    --enable-chunked-prefill "${enable_chunked_prefill}" \
    --tensor-parallel-size "${tp_size}" \
    --pipeline-parallel-size "${pp_size}" \
    --block-size "${block_size}" \
    --port "${port}" \
    --enable-prefix-caching "${enable_prefix_caching}" \
    --disable-custom-all-reduce "${disable_custom_all_reduce}" \
    --use-v2-block-manager "${use_v2_block_manager}" \
    --trust-remote-code \
    --disable-tqdm \
    --seed 42

  # additional_options holds a flat, space-separated list of flags, so it is
  # intentionally left unquoted to be split into separate words. When it is
  # empty or unset it contributes no arguments at all.
  # shellcheck disable=SC2086
  set -- "$@" ${additional_options}

  # Log the command, then execute the very same argument vector.
  echo "$@"
  "$@"
}

run_benchmark_client