
Commit 5ed17e4

Author: Michael Harrison
Commit message: improved localvllm logic
Parent: bb6f0b3

File tree: 5 files changed, +110 −105 lines


deploy_and_run.sh (−61)

This file was deleted.

deploy_vllm_and_run_eval.sh (+77)

New file (77 lines added):
#!/bin/bash

export PYTHONPATH="$(pwd):$PYTHONPATH"
model_name="microsoft/phi-4"
exp_config="IFEval_PIPELINE"
current_datetime=$(date +"%Y-%m-%d-%H:%M:%S")
log_dir="logs/deploy_vllm_and_run_eval/$current_datetime"
mkdir -p $log_dir

# vLLM args
num_servers=4
tensor_parallel_size=1
pipeline_parallel_size=1
base_port=8000
gpus_per_port=$((tensor_parallel_size * pipeline_parallel_size))

# Add any additional args accepted by vLLM serve here
VLLM_ARGS="\
--tensor-parallel-size=${tensor_parallel_size} \
--pipeline-parallel-size=${pipeline_parallel_size} \
--gpu-memory-utilization=0.9 \
"

# Start servers
echo "Spinning up servers..."
for (( i = 0; i < $num_servers; i++ )) do
    port=$((base_port + i))
    first_gpu=$((i * gpus_per_port))
    last_gpu=$((first_gpu + gpus_per_port - 1))
    devices=$(seq -s, $first_gpu $last_gpu)
    CUDA_VISIBLE_DEVICES=${devices} vllm serve ${model_name} "$@" --port ${port} ${VLLM_ARGS} >> $log_dir/${port}.log 2>&1 &
done

# Wait for servers to come online
while true; do

    servers_online=0
    for (( i = 0; i < $num_servers; i++ )) do
        port=$((base_port + i))
        url="http://0.0.0.0:${port}/health"
        response=$(curl -s -o /dev/null -w "%{http_code}" "$url")

        if [ "$response" -eq 200 ]; then
            servers_online=$((servers_online + 1))
        fi
    done

    if [ $servers_online -eq $num_servers ]; then
        echo "All servers are online."
        break
    else
        echo "Waiting for $((num_servers - servers_online)) more servers to come online..."
    fi

    sleep 10
done

# Call Eureka to initiate evals
ports=$(seq -s ' ' $base_port $((base_port + num_servers - 1)))
EUREKA_ARGS="\
--model_config=${model_name} \
--exp_config=${exp_config} \
--local_vllm \
--ports ${ports} \
"
echo "Starting evals..."
python main.py ${EUREKA_ARGS} >> $log_dir/out.log 2>&1

# Shut down servers
echo "Shutting down vLLM servers..."
for (( i = 0; i < $num_servers; i++ )) do
    port=$((base_port + i))
    logfile="$log_dir/${port}.log"
    pid=$(grep "Started server process" $logfile | grep -o '[0-9]\+')
    echo "Shutting down server on port ${port} (PID ${pid})"
    kill -INT $pid
done
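Note that the script forwards anything given after its name straight to `vllm serve` via `"$@"`, so serving options can be adjusted per run without editing the file. A hypothetical invocation (the flag shown is a standard vLLM serve option, not something this commit configures):

# Any trailing flags are forwarded to `vllm serve` through "$@"
bash deploy_vllm_and_run_eval.sh --max-model-len 4096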

eureka_ml_insights/models/models.py (+33 −24)

@@ -2,12 +2,14 @@

 import json
 import logging
+import random
+import re
 import requests
 import time
 import urllib.request
 from abc import ABC, abstractmethod
+from concurrent.futures import ThreadPoolExecutor
 from dataclasses import dataclass
-import random

 import anthropic
 import tiktoken
@@ -1157,7 +1159,7 @@ def model_template_fn(self, text_prompt, system_message=None):
         raise NotImplementedError


-class LocalVLLMDeploymentHandler:
+class _LocalVLLMDeploymentHandler:
     """This class is used to handle the deployment of vLLM servers."""

     # Chose against dataclass here so we have the option to accept kwargs
@@ -1241,22 +1243,35 @@ def get_healthy_ports(self) -> list[str]:
         return healthy_ports

     def deploy_servers(self):
+        """Deploy vLLM servers in background threads using the specified parameters."""
+
         logging.info(f"No vLLM servers are running. Starting {self.num_servers} new servers at {self.ports}.")
-        import os, subprocess, sys, datetime
-
-        env = os.environ.copy()
-        env['NUM_SERVERS'] = str(self.num_servers)
-        env['CURRENT_PYTHON_EXEC'] = sys.executable
-        env['GPU_SKIP'] = str(self.pipeline_parallel_size * self.tensor_parallel_size)
+        import os, datetime

+        gpus_per_port = self.pipeline_parallel_size * self.tensor_parallel_size
         date = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S.%f")
         log_dir = os.path.join("logs", "local_vllm_deployment_logs", f"{date}")
         os.makedirs(log_dir)
-        env['LOCAL_VLLM_LOG_DIR'] = log_dir
+
+        executor = ThreadPoolExecutor(max_workers=self.num_servers)
+        futures = [executor.submit(lambda index: self.deploy_server(index, gpus_per_port, log_dir), i) for i in range(self.num_servers)]
+
+    def deploy_server(self, index: int, gpus_per_port: int, log_dir: str):
+        """Deploy a single vLLM server using gpus_per_port many gpus starting at index*gpus_per_port."""
+
+        import os, subprocess
+
+        port = 8000 + index
+        first_gpu = index * gpus_per_port
+        last_gpu = first_gpu + gpus_per_port - 1
+        devices = ",".join(str(gpu_num) for gpu_num in range(first_gpu, last_gpu + 1))
+        log_file = os.path.join(log_dir, f"{port}.log")

         command = [
-            os.path.dirname(os.path.abspath(__file__)) + "/vllm_deployment_script.sh",
-            "--model", self.model_name,
+            "CUDA_VISIBLE_DEVICES=" + devices,
+            "vllm serve",
+            self.model_name,
+            "--port", str(port),
             "--tensor_parallel_size", str(self.tensor_parallel_size),
             "--pipeline_parallel_size", str(self.pipeline_parallel_size),
             "--dtype", self.dtype,
@@ -1269,17 +1284,11 @@ def deploy_servers(self):
             command.append(self.quantization)
         if self.trust_remote_code:
             command.append("--trust_remote_code")
+        #command.append(">> " + log_file + " 2>&1 &")
+        command = " ".join(command)
         logging.info(f"Running command: {command}")
-        response = subprocess.run(command, text=True, env=env)
-        return response
-
-    @classmethod
-    def shutdown_servers(cls):
-        # Consider whether this is appropriate since it will probably kill all vLLM servers.
-        import subprocess
-        logging.info(f"Shutting down vLLM servers.")
-        command = [f'pgrep -f "vllm.entrypoints.openai.api_server --model" | xargs kill -INT']
-        subprocess.run(command, shell=True)
+        with open(log_file, 'w') as log_writer:
+            subprocess.run(command, shell=True, stdout=log_writer, stderr=log_writer)


 @dataclass
@@ -1301,7 +1310,7 @@ class LocalVLLMModel(Model, OpenAICommonRequestResponseMixIn):

     # Deployment handler
     ports: list = None
-    handler: LocalVLLMDeploymentHandler = None
+    handler: _LocalVLLMDeploymentHandler = None

     # Inference parameters
     temperature: float = 0.01
@@ -1312,7 +1321,7 @@ class LocalVLLMModel(Model, OpenAICommonRequestResponseMixIn):
     def __post_init__(self):
         if not self.model_name:
             raise ValueError("LocalVLLM model_name must be specified.")
-        self.handler = LocalVLLMDeploymentHandler(
+        self.handler = _LocalVLLMDeploymentHandler(
             model_name=self.model_name,
             num_servers=self.num_servers,
             trust_remote_code=self.trust_remote_code,
@@ -1351,7 +1360,7 @@ def generate(self, text_prompt, query_images=None, system_message=None):
         response_dict = {}

         if text_prompt:
-            # Format request for OpenAI API using create_request from OpenAIRequestResponseMixIn
+            # Format request for OpenAI API using create_request from OpenAICommonRequestResponseMixIn
             request = self.create_request(text_prompt, query_images, system_message)
             try:
                 response_dict.update(self._generate(request))
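Net effect of this diff: the handler no longer shells out to the deleted vllm_deployment_script.sh. Instead, deploy_servers submits one deploy_server(index, ...) call per server to a ThreadPoolExecutor, and each worker runs a `CUDA_VISIBLE_DEVICES=... vllm serve ...` command with its output redirected to a per-port log file. A minimal standalone sketch of that pattern follows; the function name, the two-server configuration, and the example model are illustrative only, not part of the commit.

# Sketch of the thread-per-server launch pattern (assumes vLLM is installed
# and that num_servers * gpus_per_port GPUs are available).
import os
import subprocess
from concurrent.futures import ThreadPoolExecutor

def launch_one(index: int, model_name: str, gpus_per_port: int, log_dir: str) -> int:
    """Start one vLLM server on port 8000 + index, pinned to its own GPU slice."""
    port = 8000 + index
    first_gpu = index * gpus_per_port
    devices = ",".join(str(g) for g in range(first_gpu, first_gpu + gpus_per_port))
    command = f"CUDA_VISIBLE_DEVICES={devices} vllm serve {model_name} --port {port}"
    log_file = os.path.join(log_dir, f"{port}.log")
    with open(log_file, "w") as log_writer:
        # Blocks until the server process exits, which is why each launch
        # runs in its own worker thread.
        return subprocess.run(command, shell=True, stdout=log_writer, stderr=log_writer).returncode

num_servers, gpus_per_port = 2, 1  # illustrative values
log_dir = os.path.join("logs", "local_vllm_deployment_logs", "example")
os.makedirs(log_dir, exist_ok=True)
executor = ThreadPoolExecutor(max_workers=num_servers)
futures = [
    executor.submit(launch_one, i, "microsoft/phi-4", gpus_per_port, log_dir)
    for i in range(num_servers)
]
# The futures only resolve once the servers are stopped (e.g. kill -INT), so a
# caller would normally poll each port's /health endpoint rather than wait here.

As in the committed code, shell=True is used so that the `CUDA_VISIBLE_DEVICES=...` prefix is interpreted as an inline environment assignment rather than as the executable name.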

eureka_ml_insights/models/vllm_deployment_script.sh (−11)

This file was deleted.

main.py (−9)

@@ -81,12 +81,3 @@
     logging.info(f"Saving experiment logs in {pipeline_config.log_dir}.")
     pipeline = Pipeline(pipeline_config.component_configs, pipeline_config.log_dir)
     pipeline.run()
-
-    # Shut down vllm servers.
-    if args.local_vllm:
-        try:
-            from eureka_ml_insights.models.models import LocalVLLMDeploymentHandler
-            LocalVLLMDeploymentHandler.shutdown_servers()
-        except:
-            logging.warning("Failed to shut down local vllm servers.")
-
