
Commit bd1a02b

Michael Harrison and Vidhisha Balachandran authored
LocalVLLMModel and deployment handler (#102)
This PR introduces `LocalVLLMModel` and `LocalVLLMDeploymentHandler`. The former is a standard Model (i.e., it implements `generate`); the latter is a new class that handles the deployment side: the deployment itself, health checks on your deployment, and shutdown of the servers. Use `LocalVLLMModel` either by defining a ModelConfig or by passing enough information on the command line for vllm to recognize your deployment, or even deploy for you. If you have already deployed, pass the "ports" parameter; otherwise the `LocalVLLMDeploymentHandler` will spin up "num_servers" (default = 1) servers for you, wait for deployment to finish, and continue with the eval pipeline.

---------

Co-authored-by: Michael Harrison <[email protected]>
Co-authored-by: Vidhisha Balachandran <[email protected]>
1 parent 45e802e commit bd1a02b
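
For the already-deployed route described in the message, a minimal command-line sketch (assuming two vLLM servers for microsoft/phi-4 are already listening on ports 8000 and 8001; the flags mirror the EUREKA_ARGS block in deploy_vllm_and_run_eval.sh below):

# Sketch only: point the eval pipeline at servers you have already started.
# Ports 8000/8001 are assumptions; use whatever ports your deployment exposes.
python main.py \
    --model_config=microsoft/phi-4 \
    --exp_config=IFEval_PIPELINE \
    --local_vllm \
    --ports 8000 8001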

File tree

5 files changed: +369 −4 lines changed


deploy_vllm_and_run_eval.sh

+77
@@ -0,0 +1,77 @@
#!/bin/bash

export PYTHONPATH="$(pwd):$PYTHONPATH"
model_name="microsoft/phi-4"
exp_config="IFEval_PIPELINE"
current_datetime=$(date +"%Y-%m-%d-%H:%M:%S")
log_dir="logs/deploy_vllm_and_run_eval/$current_datetime"
mkdir -p $log_dir

# vLLM args
num_servers=4
tensor_parallel_size=1
pipeline_parallel_size=1
base_port=8000
gpus_per_port=$((tensor_parallel_size * pipeline_parallel_size))

# Add any additional args accepted by vLLM serve here
VLLM_ARGS="\
--tensor-parallel-size=${tensor_parallel_size} \
--pipeline-parallel-size=${pipeline_parallel_size} \
--gpu-memory-utilization=0.9 \
"

# Start servers
echo "Spinning up servers..."
for (( i = 0; i < $num_servers; i++ )) do
    port=$((base_port + i))
    first_gpu=$((i * gpus_per_port))
    last_gpu=$((first_gpu + gpus_per_port - 1))
    devices=$(seq -s, $first_gpu $last_gpu)
    CUDA_VISIBLE_DEVICES=${devices} vllm serve ${model_name} "$@" --port ${port} ${VLLM_ARGS} >> $log_dir/${port}.log 2>&1 &
done

# Wait for servers to come online
while true; do

    servers_online=0
    for (( i = 0; i < $num_servers; i++ )) do
        port=$((base_port + i))
        url="http://0.0.0.0:${port}/health"
        response=$(curl -s -o /dev/null -w "%{http_code}" "$url")

        if [ "$response" -eq 200 ]; then
            servers_online=$((servers_online + 1))
        fi
    done

    if [ $servers_online -eq $num_servers ]; then
        echo "All servers are online."
        break
    else
        echo "Waiting for $((num_servers - servers_online)) more servers to come online..."
    fi

    sleep 10
done

# Call Eureka to initiate evals
ports=$(seq -s ' ' $base_port $((base_port + num_servers - 1)))
EUREKA_ARGS="\
--model_config=${model_name} \
--exp_config=${exp_config} \
--local_vllm \
--ports ${ports} \
"
echo "Starting evals..."
python main.py ${EUREKA_ARGS} >> $log_dir/out.log 2>&1

# Shut down servers
echo "Shutting down vLLM servers..."
for (( i = 0; i < $num_servers; i++ )) do
    port=$((base_port + i))
    logfile="$log_dir/${port}.log"
    pid=$(grep "Started server process" $logfile | grep -o '[0-9]\+')
    echo "Shutting down server on port ${port} (PID ${pid})"
    kill -INT $pid
done

eureka_ml_insights/configs/model_configs.py

+23-1
@@ -12,6 +12,7 @@
     LlamaServerlessAzureRestEndpointModel,
     LLaVAHuggingFaceModel,
     LLaVAModel,
+    LocalVLLMModel,
     Phi4HFModel,
     MistralServerlessAzureRestEndpointModel,
     DeepseekR1ServerlessAzureRestEndpointModel,
@@ -297,6 +298,27 @@
     },
 )
 
+# Local VLLM Models
+# Adapt to your local deployments, or give enough info for vllm deployment.
+PHI4_LOCAL_CONFIG = ModelConfig(
+    LocalVLLMModel,
+    {
+        # this name must match the vllm deployment name/path
+        "model_name": "microsoft/phi-4",
+        # specify ports in case the model is already deployed
+        "ports": ["8002", "8003"],
+    },
+)
+QWQ32B_LOCAL_CONFIG = ModelConfig(
+    LocalVLLMModel,
+    {
+        # this name must match the vllm deployment name/path
+        "model_name": "Qwen/QwQ-32B",
+        # certain args will get passed to the vllm serve command
+        "tensor_parallel_size": 2,
+    },
+)
+
 # DeepSeek R1 Endpoints on Azure
 DEEPSEEK_R1_CONFIG = ModelConfig(
     DeepseekR1ServerlessAzureRestEndpointModel,
@@ -311,4 +333,4 @@
         # the timeout parameter is passed to urllib.request.urlopen(request, timeout=self.timeout) in ServerlessAzureRestEndpointModel
         "timeout": 600,
     },
-)
+)
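
Once a local config such as PHI4_LOCAL_CONFIG is defined, it can presumably be selected by name when launching the pipeline, the same way other configs in model_configs.py are referenced; a hedged sketch (assumes --model_config resolves the config name and that the ports listed in the config are live):

# Sketch only: run evals against the PHI4_LOCAL_CONFIG defined above.
# Assumes main.py accepts the ModelConfig name and servers on ports 8002/8003 are up;
# --local_vllm mirrors the deploy script's invocation.
python main.py --model_config=PHI4_LOCAL_CONFIG --exp_config=IFEval_PIPELINE --local_vllm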

eureka_ml_insights/models/__init__.py

+2
@@ -11,6 +11,7 @@
     LlamaServerlessAzureRestEndpointModel,
     LLaVAHuggingFaceModel,
     LLaVAModel,
+    LocalVLLMModel,
     MistralServerlessAzureRestEndpointModel,
     DeepseekR1ServerlessAzureRestEndpointModel,
     Phi3HFModel,
@@ -38,6 +39,7 @@
     LlamaServerlessAzureRestEndpointModel,
     DeepseekR1ServerlessAzureRestEndpointModel,
     LLaVAModel,
+    LocalVLLMModel,
     RestEndpointModel,
     TestModel,
     VLLMModel,
