# Source: Faster-MoA/Faster-MoA-PD/run_servers_hetero_dmv_pd_w_experiment.sh (sharc-lab/Faster-MoA, branch main, GitHub)

#######################################
# Poll a local server's /health_generate endpoint until it answers
# successfully, or give up after a bounded number of attempts.
# Arguments:
#   $1 - TCP port of the server on localhost
#   $2 - (optional) maximum number of polls, default 3000
#   $3 - (optional) seconds to sleep between polls, default 2
# Outputs:
#   Progress and result messages on stdout.
# Returns:
#   0 as soon as one poll succeeds, 1 if every poll failed.
#######################################
wait_for_server() {
  local port=$1
  local max_retries=${2:-3000}
  local retry_interval=${3:-2}
  local attempt=0

  while (( attempt < max_retries )); do
    # --fail: without it curl exits 0 even when the server replies with an
    # HTTP error status, which would report a not-yet-ready server as up.
    # --output /dev/null: only the exit status matters; keep the response
    # body out of the script's stdout.
    if curl --silent --fail --output /dev/null \
        "http://localhost:${port}/health_generate"; then
      echo "Server on port ${port} is up!"
      return 0
    fi

    echo "Waiting for server on port ${port} to start... (Attempt $((attempt + 1))/${max_retries})"
    sleep "$retry_interval"
    attempt=$(( attempt + 1 ))
  done

  echo "Server on port ${port} failed to start after ${max_retries} attempts."
  return 1
}

# Create a fresh per-run log directory: logs/test_logs_<YYYYmmdd_HHMMSS>.
# TIMESTAMP is reused by the rest of the script to build log file paths.
TIMESTAMP=$(date +%Y%m%d_%H%M%S)

# A single mkdir -p from the current directory avoids an unchecked
# `cd logs` / `cd ..` round trip, which would leave the script in the wrong
# directory if the cd failed. Without the directory every later log
# redirection fails, so stop right away.
if ! mkdir -p "logs/test_logs_${TIMESTAMP}"; then
  echo "Could not create log directory logs/test_logs_${TIMESTAMP}" >&2
  exit 1
fi

# model="meta-llama/Llama-3.1-8B-Instruct"
# model="Qwen/Qwen3-4B-Instruct-2507"
# model="Qwen/Qwen3-VL-2B-Instruct"
# model="Qwen/Qwen3-VL-4B-Instruct"
# model="Qwen/Qwen3-VL-8B-Instruct"
# model="Qwen/Qwen3-VL-32B-Instruct"

# Read the run parameters from environment variables, falling back to
# defaults when a variable is unset or empty:
#   QUESTION_BATCH_SIZE -> batch_size   (default 1)
#   CONFIG_PATH         -> config       (default below)
#   NUM_SAMPLES         -> num_samples  (default 60)
# config=${CONFIG_PATH:-"real_dataset_tb_configs.json"}

batch_size=${QUESTION_BATCH_SIZE:-1}
config=${CONFIG_PATH:-"./cfgs/hetero_dmv/hetero_math500_dmv_all.json"}
num_samples=${NUM_SAMPLES:-60}

echo "Config Path: $config"
# `model` is not assigned anywhere in this script (the assignments above are
# all commented out), so a bare "$model" printed an empty value. Show an
# explicit note instead, while still printing the value if one is inherited.
echo "Model Path: ${model:-<not set; model paths are fixed per server in the launch commands>}"
echo "Number of Samples: $num_samples"
echo "Question Batch Size: $batch_size"

#######################################
# Start one launch_server.py instance in the background.
# Arguments:
#   $1   - value assigned to CUDA_VISIBLE_DEVICES for the process
#   $2   - value passed to --model-path
#   $3   - file that receives both stdout and stderr
#   $4.. - server-specific flags, passed through verbatim between the shared
#          flags and the trailing transfer-backend flag
#######################################
start_pd_server() {
  local gpu_id=$1
  local model_path=$2
  local log_file=$3
  shift 3

  CUDA_VISIBLE_DEVICES=$gpu_id python launch_server.py \
    --log-level debug \
    --model-path "$model_path" \
    --max-prefill-tokens 65536 \
    --mem-fraction-static 0.9 \
    --schedule-conservativeness 0.0 \
    "$@" \
    --disaggregation-transfer-backend nixl &> "$log_file" &
}

# Six servers, one per GPU 0-5: a prefill-mode and a decode-mode server for
# each of the three models. Prefill servers listen on 3000x and get a
# bootstrap port 988x; decode servers listen on 3100x.
server_log_dir="logs/test_logs_$TIMESTAMP"

start_pd_server 0 "Qwen/Qwen3-VL-4B-Instruct" "$server_log_dir/prefill_4b.log" \
  --disaggregation-mode prefill \
  --disaggregation-bootstrap-port 9880 \
  --trust-remote-code \
  --port 30000
start_pd_server 1 "Qwen/Qwen3-VL-4B-Instruct" "$server_log_dir/decode_4b.log" \
  --disaggregation-mode decode \
  --trust-remote-code \
  --port 31000
start_pd_server 2 "Qwen/Qwen3-VL-8B-Instruct" "$server_log_dir/prefill_8b.log" \
  --disaggregation-mode prefill \
  --trust-remote-code \
  --disaggregation-bootstrap-port 9881 \
  --port 30001
start_pd_server 3 "Qwen/Qwen3-VL-8B-Instruct" "$server_log_dir/decode_8b.log" \
  --disaggregation-mode decode \
  --trust-remote-code \
  --port 31001
start_pd_server 4 "Qwen/Qwen3-VL-32B-Instruct" "$server_log_dir/prefill_32b.log" \
  --disaggregation-mode prefill \
  --trust-remote-code \
  --disaggregation-bootstrap-port 9882 \
  --port 30002
start_pd_server 5 "Qwen/Qwen3-VL-32B-Instruct" "$server_log_dir/decode_32b.log" \
  --disaggregation-mode decode \
  --trust-remote-code \
  --port 31002

# Block until every prefill (3000x) and decode (3100x) server passes its
# health check. wait_for_server returns non-zero when a server never came up;
# in that case launching the routers and the experiment is pointless, so stop
# the servers that were already started and abort.
for server_port in 30000 30001 30002 31000 31001 31002; do
  if ! wait_for_server "$server_port"; then
    echo "Aborting: server on port ${server_port} did not become healthy. See logs/test_logs_$TIMESTAMP/ for details." >&2
    # Same patterns the script's own teardown uses for the server processes.
    pkill -f launch_server.py
    pkill -f sglang
    exit 1
  fi
done

# Start one sglang_router instance per model in the background. Router i
# (0, 1, 2) is pointed at prefill server 3000i with bootstrap port 988i and
# decode server 3100i, listens on 800i, exposes Prometheus on 2900i, and logs
# to router_<tag>.log.
router_tags=(4b 8b 32b)
for router_idx in "${!router_tags[@]}"; do
  python -m sglang_router.launch_router \
    --pd-disaggregation \
    --prefill "http://127.0.0.1:$(( 30000 + router_idx ))" "$(( 9880 + router_idx ))" \
    --decode "http://127.0.0.1:$(( 31000 + router_idx ))" \
    --host 0.0.0.0 \
    --port "$(( 8000 + router_idx ))" \
    --prometheus-host 0.0.0.0 \
    --prometheus-port "$(( 29000 + router_idx ))" \
    --log-level debug &> "logs/test_logs_$TIMESTAMP/router_${router_tags[router_idx]}.log" &
done

run_log_dir="logs/test_logs_$TIMESTAMP"

# Fixed pause after the routers were launched (presumably to let them start
# listening before anything connects -- there is no readiness check here).
sleep 3

# Start shell_router_dmv.py in the background with the selected config;
# stdout and stderr both go to shell_router.log.
CONFIG_PATH="$config" python shell_router_dmv.py > "$run_log_dir/shell_router.log" 2>&1 &

# Second fixed pause before the experiment starts.
sleep 10

# Run the experiment script

# Baseline variant, currently disabled:
# CONFIG_PATH=$config QUESTION_BATCH_SIZE=$batch_size NUM_SAMPLES=$num_samples RUN_PROPOSED=0 RUN_BASELINE=1 python tb_real_dataset_agent_tree_structure_dmv.py &> logs/test_logs_$TIMESTAMP/experiment_baseline.log
# Proposed variant: runs in the foreground, so the script waits for it here.
CONFIG_PATH="$config" \
QUESTION_BATCH_SIZE="$batch_size" \
NUM_SAMPLES="$num_samples" \
RUN_PROPOSED=1 \
RUN_BASELINE=0 \
  python tb_real_dataset_agent_tree_structure_dmv.py > "$run_log_dir/experiment_proposed.log" 2>&1

# Teardown: stop everything this script started.
pkill -f launch_server.py
# The shell router is started as shell_router_dmv.py. The pattern used here
# before, "shell_router.py", cannot match that command line, so the router
# was only ever stopped by the catch-all below.
pkill -f shell_router_dmv.py
pkill -f sglang

# NOTE(review): this catch-all signals EVERY process visible to the invoking
# user whose command line contains "python", including unrelated jobs on a
# shared machine. It is kept as the default to preserve existing behavior;
# set KILL_ALL_PYTHON=0 to rely on the targeted patterns above instead.
if [[ "${KILL_ALL_PYTHON:-1}" == "1" ]]; then
  pkill -f python
fi

# Kill whatever still holds the ports used in this run: prefill servers
# (3000x), decode servers (3100x) and routers (800x). Port 8100 is not opened
# by any command in this script; presumably it belongs to
# shell_router_dmv.py -- TODO confirm.
for used_port in 30000 30001 30002 31000 31001 31002 8000 8001 8002 8100; do
  fuser -k "${used_port}/tcp"
done

echo "Experiment completed. Logs are saved in logs/test_logs_$TIMESTAMP/"