Commit 9318e41

Author: Samuel Shen
Commit message: push to fix daily benchmarking
1 parent feb7482 commit 9318e41

18 files changed, +705 -24 lines

2-serving-engines/common/cleanup-all-baselines.sh

Lines changed: 1 addition & 1 deletion
@@ -152,7 +152,7 @@ else
     echo "nvidia-smi not found, skipping GPU cleanup"
 fi
 
-# 2. Port 30080 Cleanup
+# 2. Port 8000,8001,8002,8003,8004,8005,8006,8007,30080 Cleanup
 echo "2. Cleaning up port 30080..."
 # Kill kubectl port-forward processes
 pkill -f "kubectl port-forward.*30080" 2>/dev/null || true
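
Note: the hunk above only changes the section comment; the cleanup commands for the newly listed ports sit outside this excerpt. A minimal sketch of what cleanup across those ports could look like, reusing the lsof/pkill style the script already uses (illustrative only, not the commit's actual code):

    for port in 8000 8001 8002 8003 8004 8005 8006 8007 30080; do
        # Kill anything still listening on the port (vLLM engines, routers, port-forwards)
        pids=$(lsof -tiTCP:$port -sTCP:LISTEN 2>/dev/null || true)
        [ -n "$pids" ] && kill -9 $pids 2>/dev/null || true
    done
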
Lines changed: 132 additions & 0 deletions
@@ -0,0 +1,132 @@
#!/bin/bash

echo "VIRTUAL_ENV is: $VIRTUAL_ENV"

# should be launched from 2-serving-engines/flat/choose-and-deploy.sh

# Check if vllm command is available
if ! command -v vllm &> /dev/null; then
    echo "ERROR: vllm command not found in PATH" >&2
    echo "Please ensure vLLM is installed and accessible:" >&2
    echo "  pip install vllm" >&2
    echo "Or activate the appropriate virtual environment" >&2
    echo "Current PATH: $PATH" >&2
    echo "Python location: $(which python3 2>/dev/null || echo 'not found')" >&2
    exit 1
fi

NUM_INSTANCES=4

# Find N free ports starting from START_PORT
find_free_ports() {
    local start=$1
    local count=$2
    local port=$start
    local free_ports=()

    while [ "${#free_ports[@]}" -lt "$count" ]; do
        if ! lsof -iTCP:$port -sTCP:LISTEN &>/dev/null; then
            free_ports+=($port)
        fi
        ((port++))
    done

    echo "${free_ports[@]}"
}

find_free_gpus() {
    local count=$1
    local free_gpus=()

    local total_gpus
    total_gpus=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)

    for ((i=0; i<total_gpus; i++)); do
        has_process=$(nvidia-smi --query-compute-apps=gpu_uuid --format=csv,noheader | grep -c "$(nvidia-smi --query-gpu=uuid --format=csv,noheader -i $i)" || true)
        if [[ "$has_process" -eq 0 ]]; then
            free_gpus+=("$i")
        fi
        if [[ "${#free_gpus[@]}" -ge "$count" ]]; then
            break
        fi
    done

    if [[ "${#free_gpus[@]}" -lt "$count" ]]; then
        echo "ERROR: Only found ${#free_gpus[@]} free GPUs, need $count" >&2
        exit 1
    fi

    echo "${free_gpus[@]}"
}

# Get 4 free ports starting from 8000
free_ports=($(find_free_ports 8000 "$NUM_INSTANCES"))
free_gpus=($(find_free_gpus "$NUM_INSTANCES"))

echo "Using ports: ${free_ports[*]}"
echo "Using GPUs: ${free_gpus[*]}"

if [ "${#free_ports[@]}" -ne "$NUM_INSTANCES" ]; then
    echo "ERROR: Only found ${#free_ports[@]} free ports, need $NUM_INSTANCES" >&2
    exit 1
fi

if [ "${#free_gpus[@]}" -ne "$NUM_INSTANCES" ]; then
    echo "ERROR: Only found ${#free_gpus[@]} free GPUs, need $NUM_INSTANCES" >&2
    exit 1
fi

for i in $(seq 0 $((NUM_INSTANCES - 1))); do
    port="${free_ports[$i]}"
    gpu="${free_gpus[$i]}"
    log_file="vllm_${port}.log"

    echo "Launching vLLM on port $port with GPU $gpu..."
    CUDA_VISIBLE_DEVICES="$gpu" \
    LMCACHE_CONFIG_FILE="configs/cpu-offload.yaml" \
    nohup vllm serve \
        meta-llama/Llama-3.1-8B-Instruct \
        --max-model-len 32000 \
        --port "$port" \
        > "$log_file" 2>&1 &
done

# do a trick here where we alternate querying each port with v1/models
# and we return once we get NUM_INSTANCES good responses in a row
# every query, we also print out the tail of the logs
echo "Waiting for all $NUM_INSTANCES engines to be ready in a row..."

ready_in_a_row=0
i=0
while true; do
    port="${free_ports[$((i % NUM_INSTANCES))]}"
    log_file="vllm_${port}.log"

    echo "⏳ Checking port $port..."
    if curl -s http://localhost:$port/v1/models > /dev/null 2>&1; then
        echo "✅ Port $port responded OK"
        ((ready_in_a_row++))
    else
        echo "❌ Port $port not ready. Resetting counter."
        ready_in_a_row=0
    fi

    echo "↪ Log tail for port $port:"
    tail -n 5 "$log_file" || echo "(no log yet)"
    echo ""

    if [[ "$ready_in_a_row" -ge "$NUM_INSTANCES" ]]; then
        echo "🎉 All $NUM_INSTANCES engines responded successfully in a row"
        break
    fi

    sleep 2
    ((i++))
done

port_arg=$(IFS=, ; echo "${free_ports[*]}")

nohup python routers/round-robin-router.py --ports "$port_arg" &
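
Note: the launch loop above points LMCache at configs/cpu-offload.yaml, which is not included in the hunks shown here. The deleted run-llama8B.sh further down configured the same CPU offload through environment variables and wired the connector in with --kv-transfer-config. A fully inline variant of one loop iteration, with those settings copied from the deleted script (so this is an assumption about what the new config file contains, not the commit's actual code), would look roughly like:

    # Illustrative only: values taken from the deleted run-llama8B.sh, not from configs/cpu-offload.yaml
    LMCACHE_CHUNK_SIZE=256 \
    LMCACHE_LOCAL_CPU=True \
    LMCACHE_MAX_LOCAL_CPU_SIZE=60.0 \
    LMCACHE_USE_EXPERIMENTAL=True \
    CUDA_VISIBLE_DEVICES="$gpu" \
    nohup vllm serve \
        meta-llama/Llama-3.1-8B-Instruct \
        --max-model-len 32000 \
        --port "$port" \
        --kv-transfer-config '{"kv_connector":"LMCacheConnectorV1", "kv_role":"kv_both"}' \
        > "$log_file" 2>&1 &
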

2-serving-engines/flat/basic-lmcache/run-llama8B.sh

Lines changed: 0 additions & 10 deletions
@@ -14,13 +14,3 @@ fi
 echo "Starting vLLM serve with LMCache integration on port 30080..."
 echo "vLLM location: $(which vllm)"
 
-LMCACHE_CHUNK_SIZE=256 \
-LMCACHE_LOCAL_CPU=True \
-LMCACHE_MAX_LOCAL_CPU_SIZE=60.0 \
-LMCACHE_USE_EXPERIMENTAL=True \
-vllm serve \
-    meta-llama/Llama-3.1-8B-Instruct \
-    --max-model-len 32000 \
-    --port 30080 \
-    --kv-transfer-config \
-    '{"kv_connector":"LMCacheConnectorV1", "kv_role":"kv_both"}'

Lines changed: 131 additions & 0 deletions
@@ -0,0 +1,131 @@
#!/bin/bash

echo "VIRTUAL_ENV is: $VIRTUAL_ENV"

# should be launched from 2-serving-engines/flat/choose-and-deploy.sh

# Check if vllm command is available
if ! command -v vllm &> /dev/null; then
    echo "ERROR: vllm command not found in PATH" >&2
    echo "Please ensure vLLM is installed and accessible:" >&2
    echo "  pip install vllm" >&2
    echo "Or activate the appropriate virtual environment" >&2
    echo "Current PATH: $PATH" >&2
    echo "Python location: $(which python3 2>/dev/null || echo 'not found')" >&2
    exit 1
fi

NUM_INSTANCES=4

# Find N free ports starting from START_PORT
find_free_ports() {
    local start=$1
    local count=$2
    local port=$start
    local free_ports=()

    while [ "${#free_ports[@]}" -lt "$count" ]; do
        if ! lsof -iTCP:$port -sTCP:LISTEN &>/dev/null; then
            free_ports+=($port)
        fi
        ((port++))
    done

    echo "${free_ports[@]}"
}

find_free_gpus() {
    local count=$1
    local free_gpus=()

    local total_gpus
    total_gpus=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)

    for ((i=0; i<total_gpus; i++)); do
        has_process=$(nvidia-smi --query-compute-apps=gpu_uuid --format=csv,noheader | grep -c "$(nvidia-smi --query-gpu=uuid --format=csv,noheader -i $i)" || true)
        if [[ "$has_process" -eq 0 ]]; then
            free_gpus+=("$i")
        fi
        if [[ "${#free_gpus[@]}" -ge "$count" ]]; then
            break
        fi
    done

    if [[ "${#free_gpus[@]}" -lt "$count" ]]; then
        echo "ERROR: Only found ${#free_gpus[@]} free GPUs, need $count" >&2
        exit 1
    fi

    echo "${free_gpus[@]}"
}

# Get 4 free ports starting from 8000
free_ports=($(find_free_ports 8000 "$NUM_INSTANCES"))
free_gpus=($(find_free_gpus "$NUM_INSTANCES"))

echo "Using ports: ${free_ports[*]}"
echo "Using GPUs: ${free_gpus[*]}"

if [ "${#free_ports[@]}" -ne "$NUM_INSTANCES" ]; then
    echo "ERROR: Only found ${#free_ports[@]} free ports, need $NUM_INSTANCES" >&2
    exit 1
fi

if [ "${#free_gpus[@]}" -ne "$NUM_INSTANCES" ]; then
    echo "ERROR: Only found ${#free_gpus[@]} free GPUs, need $NUM_INSTANCES" >&2
    exit 1
fi

for i in $(seq 0 $((NUM_INSTANCES - 1))); do
    port="${free_ports[$i]}"
    gpu="${free_gpus[$i]}"
    log_file="vllm_${port}.log"

    echo "Launching vLLM on port $port with GPU $gpu..."
    CUDA_VISIBLE_DEVICES="$gpu" \
    nohup vllm serve \
        meta-llama/Llama-3.1-8B-Instruct \
        --max-model-len 32000 \
        --port "$port" \
        > "$log_file" 2>&1 &
done

# do a trick here where we alternate querying each port with v1/models
# and we return once we get NUM_INSTANCES good responses in a row
# every query, we also print out the tail of the logs
echo "Waiting for all $NUM_INSTANCES engines to be ready in a row..."

ready_in_a_row=0
i=0
while true; do
    port="${free_ports[$((i % NUM_INSTANCES))]}"
    log_file="vllm_${port}.log"

    echo "⏳ Checking port $port..."
    if curl -s http://localhost:$port/v1/models > /dev/null 2>&1; then
        echo "✅ Port $port responded OK"
        ((ready_in_a_row++))
    else
        echo "❌ Port $port not ready. Resetting counter."
        ready_in_a_row=0
    fi

    echo "↪ Log tail for port $port:"
    tail -n 5 "$log_file" || echo "(no log yet)"
    echo ""

    if [[ "$ready_in_a_row" -ge "$NUM_INSTANCES" ]]; then
        echo "🎉 All $NUM_INSTANCES engines responded successfully in a row"
        break
    fi

    sleep 2
    ((i++))
done

port_arg=$(IFS=, ; echo "${free_ports[*]}")

nohup python routers/round-robin-router.py --ports "$port_arg" &
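
Note: routers/round-robin-router.py is referenced by both launch scripts but is not part of the hunks shown here, so its listen address is not known from this diff. Assuming it exposes an OpenAI-compatible endpoint on port 30080 (the port used by the old single-engine setup and the cleanup script; this is a guess, not something the diff states), a quick smoke test after the script finishes could be:

    # Hypothetical: assumes the router listens on 30080 and proxies the OpenAI-compatible API
    curl -s http://localhost:30080/v1/models
    curl -s http://localhost:30080/v1/completions \
        -H "Content-Type: application/json" \
        -d '{"model": "meta-llama/Llama-3.1-8B-Instruct", "prompt": "Hello", "max_tokens": 16}'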
