Skip to content

Commit 716648c

Browse files
committed
More comprehensive update to instance status model.
Also: update the tests.
Signed-off-by: Mike Spreitzer <mspreitz@us.ibm.com>
1 parent 797d241 commit 716648c

File tree

3 files changed

+82
-45
lines changed

3 files changed

+82
-45
lines changed

docs/launcher.md

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,7 @@ Response:
152152
{
153153
"status": "started",
154154
"instance_id": "a1b2c3d4-e5f6-7890-abcd-ef1234567890",
155+
"options": "--model facebook/opt-125m --port 8000"
155156
}
156157
```
157158

@@ -244,6 +245,7 @@ Create a new vLLM instance with an auto-generated UUID.
244245
```json
245246
{
246247
"options": "--model MODEL_NAME --port PORT",
248+
"gpu_uuids": ["GPU-33", "GPU-86"],
247249
"env_vars": {
248250
"VAR_NAME": "value"
249251
}
@@ -253,6 +255,7 @@ Create a new vLLM instance with an auto-generated UUID.
253255
**Parameters:**
254256

255257
- `options` (required): Command-line options for vLLM
258+
- `gpu_uuids` (optional): List of GPU UUIDs
256259
- `env_vars` (optional): Dictionary of environment variables
257260

258261
**Response (201 Created):**
@@ -261,6 +264,11 @@ Create a new vLLM instance with an auto-generated UUID.
261264
{
262265
"status": "started",
263266
"instance_id": "uuid-string",
267+
"options": "--model MODEL_NAME --port PORT",
268+
"gpu_uuids": ["GPU-33", "GPU-86"],
269+
"env_vars": {
270+
"VAR_NAME": "value"
271+
}
264272
}
265273
```
266274

@@ -307,6 +315,11 @@ Stop and delete a specific vLLM instance.
307315
{
308316
"status": "terminated",
309317
"instance_id": "instance-id",
318+
"options": "--model MODEL_NAME --port PORT",
319+
"gpu_uuids": ["GPU-33", "GPU-86"],
320+
"env_vars": {
321+
"VAR_NAME": "value"
322+
}
310323
}
311324
```
312325

@@ -408,14 +421,23 @@ Get status information for all instances. `Detail` is `True` by default.
408421
{
409422
"status": "running",
410423
"instance_id": "id-1",
424+
"options": <options 1>,
425+
"gpu_uuids": <gpus 1>,
426+
"env_vars": <envars 1>
411427
},
412428
{
413429
"status": "stopped",
414430
"instance_id": "id-2",
431+
"options": <options 2>,
432+
"gpu_uuids": <gpus 2>,
433+
"env_vars": <envars 2>
415434
},
416435
{
417436
"status": "running",
418437
"instance_id": "id-3",
438+
"options": <options 3>,
439+
"gpu_uuids": <gpus 3>,
440+
"env_vars": <envars 3>
419441
}
420442
]
421443
}
@@ -444,6 +466,11 @@ Get status information for a specific instance.
444466
{
445467
"status": "running",
446468
"instance_id": "instance-id",
469+
"options": "--model MODEL_NAME --port PORT",
470+
"gpu_uuids": ["GPU-33", "GPU-86"],
471+
"env_vars": {
472+
"VAR_NAME": "value"
473+
}
447474
}
448475
```
449476

@@ -510,6 +537,7 @@ curl -X POST http://localhost:8001/v2/vllm/instances \
510537
-H "Content-Type: application/json" \
511538
-d '{
512539
"options": "--model meta-llama/Llama-2-7b-hf --port 8000 --tensor-parallel-size 2",
540+
"gpu_uuids": ["GPU-33", "GPU-86"],
513541
"env_vars": {
514542
"CUDA_VISIBLE_DEVICES": "0,1",
515543
"VLLM_ATTENTION_BACKEND": "FLASHINFER",
@@ -640,13 +668,15 @@ Pydantic model (data class) defining the configuration for a vLLM instance.
640668
**Attributes:**
641669

642670
- `options` (str): Command-line options passed to vLLM (e.g., `"--model meta-llama/Llama-2-7b --port 8000"`)
671+
- `gpu_uuids` (Optional[List[str]]): UUIDs of GPUs
643672
- `env_vars` (Optional[Dict[str, Any]]): Environment variables to set for the vLLM process
644673

645674
Ex:
646675

647676
```yaml
648677
{
649678
"options": "--model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --port 8005",
679+
"gpu_uuids": ["GPU-33", "GPU-86"],
650680
"env_vars": {
651681
"VLLM_USE_V1": "1",
652682
"VLLM_LOGGING_LEVEL": "DEBUG"

inference_server/launcher/launcher.py

Lines changed: 27 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,14 @@ class VllmConfig(BaseModel):
6161
env_vars: Optional[Dict[str, str]] = None
6262

6363

64+
class HalfMade(Exception):
65+
"""Raised when something other than start is the first op on a VllmInstance"""
66+
67+
def __init__(self, instance_id):
68+
super().__init__()
69+
self.instance_id = instance_id
70+
71+
6472
class VllmInstance:
6573
"""Represents a single vLLM instance"""
6674

@@ -106,13 +114,20 @@ def __init__(
106114
f"launcher-{os.getpid()}-vllm-{instance_id}.log",
107115
)
108116

117+
def _make_state(self, status: str) -> dict:
118+
return {
119+
"status": status,
120+
"instance_id": self.instance_id,
121+
**self.config.model_dump(exclude_none=True),
122+
}
123+
109124
def start(self) -> dict:
110125
"""
111126
Start this vLLM instance
112127
:return: Status of the process.
113128
"""
114129
if self.process and self.process.is_alive():
115-
return {"status": "already_running", "instance_id": self.instance_id}
130+
return self._make_state("already_running")
116131

117132
# Create empty log file before spawning the child process
118133
open(self._log_file_path, "wb").close()
@@ -122,18 +137,17 @@ def start(self) -> dict:
122137
)
123138
self.process.start()
124139

125-
return {
126-
"status": "started",
127-
"instance_id": self.instance_id,
128-
}
140+
return self._make_state("started")
129141

130142
def stop(self, timeout: int = 10) -> dict:
131143
"""
132144
Stop existing vLLM instance
133145
:param timeout: waits for the process to stop, defaults to 10
134146
:return: a dictionary with the status "terminated"
135147
"""
136-
if not self.process or not self.process.is_alive():
148+
if self.process is None:
149+
raise HalfMade(self.instance_id)
150+
if not self.process.is_alive():
137151
self._cleanup_log_file()
138152
return {
139153
"status": "not_running",
@@ -156,10 +170,7 @@ def stop(self, timeout: int = 10) -> dict:
156170
self.process.join()
157171

158172
self._cleanup_log_file()
159-
return {
160-
"status": "terminated",
161-
"instance_id": self.instance_id,
162-
}
173+
return self._make_state("terminated")
163174

164175
def _cleanup_log_file(self):
165176
"""Remove the log file if it exists."""
@@ -173,12 +184,9 @@ def get_status(self) -> dict:
173184
Returns the status of the process
174185
:return: Status of the running process.
175186
"""
176-
177-
return {
178-
"status": "running" if self.process.is_alive() else "stopped",
179-
"instance_id": self.instance_id,
180-
**self.config.model_dump(),
181-
}
187+
if self.process is None:
188+
raise HalfMade(self.instance_id)
189+
return self._make_state("running" if self.process.is_alive() else "stopped")
182190

183191
def get_log_bytes(
184192
self, start: int = 0, end: int | None = None
@@ -246,6 +254,9 @@ def create_instance(
246254
)
247255
self.instances[instance_id] = instance
248256

257+
# Because start() is always the first method called on a VllmInstance,
258+
# the other methods of VllmMultiProcessManager do not need to handle
259+
# the HalfMade exception.
249260
return instance.start()
250261

251262
def stop_instance(self, instance_id: str, timeout: int = 10) -> dict:

inference_server/launcher/tests/test_launcher.py

Lines changed: 25 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
# Import the application and classes
3838
from launcher import ( # noqa: E402
3939
MAX_LOG_RESPONSE_BYTES,
40+
HalfMade,
4041
LogRangeNotAvailable,
4142
VllmConfig,
4243
VllmInstance,
@@ -147,7 +148,7 @@ def test_instance_creation(self, vllm_config, gpu_translator, tmp_log_dir):
147148

148149
@patch("launcher.multiprocessing.Process")
149150
def test_instance_start(
150-
self, mock_process_class, vllm_config, gpu_translator, tmp_log_dir
151+
self, mock_process_class, vllm_config: VllmConfig, gpu_translator, tmp_log_dir
151152
):
152153
"""Test starting a vLLM instance"""
153154
mock_process = MockProcess()
@@ -160,11 +161,13 @@ def test_instance_start(
160161

161162
assert result["status"] == "started"
162163
assert result["instance_id"] == "test-id"
164+
for key, val in vllm_config.model_dump(exclude_none=True).items():
165+
assert result[key] == val
163166
assert os.path.exists(instance._log_file_path)
164167

165168
@patch("launcher.multiprocessing.Process")
166169
def test_instance_start_already_running(
167-
self, mock_process_class, vllm_config, gpu_translator, tmp_log_dir
170+
self, mock_process_class, vllm_config: VllmConfig, gpu_translator, tmp_log_dir
168171
):
169172
"""Test starting an instance that's already running"""
170173
mock_process = MockProcess()
@@ -178,10 +181,12 @@ def test_instance_start_already_running(
178181

179182
assert result["status"] == "already_running"
180183
assert result["instance_id"] == "test-id"
184+
for key, val in vllm_config.model_dump(exclude_none=True).items():
185+
assert result[key] == val
181186

182187
@patch("launcher.multiprocessing.Process")
183188
def test_instance_stop(
184-
self, mock_process_class, vllm_config, gpu_translator, tmp_log_dir
189+
self, mock_process_class, vllm_config: VllmConfig, gpu_translator, tmp_log_dir
185190
):
186191
"""Test stopping a running instance"""
187192
mock_process = MockProcess()
@@ -195,6 +200,8 @@ def test_instance_stop(
195200

196201
assert result["status"] == "terminated"
197202
assert result["instance_id"] == "test-id"
203+
for key, val in vllm_config.model_dump(exclude_none=True).items():
204+
assert result[key] == val
198205
assert mock_process.terminated is True
199206

200207
@patch("launcher.multiprocessing.Process")
@@ -203,10 +210,11 @@ def test_instance_stop_not_running(self, vllm_config, gpu_translator, tmp_log_di
203210
instance = VllmInstance(
204211
"test-id", vllm_config, gpu_translator, log_dir=tmp_log_dir
205212
)
206-
result = instance.stop()
207-
208-
assert result["status"] == "not_running"
209-
assert result["instance_id"] == "test-id"
213+
try:
214+
_ = instance.stop()
215+
assert False
216+
except HalfMade:
217+
assert True
210218

211219
@patch("launcher.os.killpg")
212220
@patch("launcher.multiprocessing.Process")
@@ -244,7 +252,7 @@ def join_side_effect(timeout=None):
244252

245253
@patch("launcher.multiprocessing.Process")
246254
def test_instance_get_status(
247-
self, mock_process_class, vllm_config, gpu_translator, tmp_log_dir
255+
self, mock_process_class, vllm_config: VllmConfig, gpu_translator, tmp_log_dir
248256
):
249257
"""Test getting instance status"""
250258
mock_process = MockProcess()
@@ -258,14 +266,15 @@ def test_instance_get_status(
258266
instance.start()
259267
status = instance.get_status()
260268
assert status["status"] == "running"
261-
assert status["options"] == vllm_config.options
262-
assert status["env_vars"] == vllm_config.env_vars
269+
for key, val in vllm_config.model_dump(exclude_none=True).items():
270+
assert status[key] == val
263271

264272
# Stopped
265273
mock_process._is_alive = False
266274
status = instance.get_status()
267275
assert status["status"] == "stopped"
268-
assert status["options"] == vllm_config.options
276+
for key, val in vllm_config.model_dump(exclude_none=True).items():
277+
assert status[key] == val
269278

270279
@patch("launcher.multiprocessing.Process")
271280
def test_instance_uuid_to_index_translation(
@@ -466,7 +475,7 @@ def test_stop_all_instances(self, mock_process_class, manager, vllm_config):
466475
assert len(manager.instances) == 0
467476

468477
@patch("launcher.multiprocessing.Process")
469-
def test_get_instance_status(self, mock_process_class, manager, vllm_config):
478+
def test_get_instance_status(self, mock_process_class, manager, vllm_config: VllmConfig):
470479
"""Test getting status of specific instance"""
471480
mock_process = MockProcess()
472481
mock_process_class.return_value = mock_process
@@ -476,8 +485,8 @@ def test_get_instance_status(self, mock_process_class, manager, vllm_config):
476485

477486
assert status["status"] == "running"
478487
assert status["instance_id"] == "test-id"
479-
assert status["options"] == vllm_config.options
480-
assert status["env_vars"] == vllm_config.env_vars
488+
for key, val in vllm_config.model_dump(exclude_none=True).items():
489+
assert status[key] == val
481490

482491
@patch("launcher.multiprocessing.Process")
483492
def test_get_instance_status_nonexistent(self, mock_process_class, manager):
@@ -500,8 +509,8 @@ def test_get_all_instances_status(self, mock_process_class, manager, vllm_config
500509
assert status["running_instances"] == 2
501510
assert len(status["instances"]) == 2
502511
for inst in status["instances"]:
503-
assert inst["options"] == vllm_config.options
504-
assert inst["env_vars"] == vllm_config.env_vars
512+
for key, val in vllm_config.model_dump(exclude_none=True).items():
513+
assert inst[key] == val
505514

506515
@patch("launcher.multiprocessing.Process")
507516
def test_list_instances(self, mock_process_class, manager, vllm_config):
@@ -1073,19 +1082,6 @@ def test_stop_terminated_cleans_up_log_file(
10731082
instance.stop()
10741083
assert not os.path.exists(instance._log_file_path)
10751084

1076-
@patch("launcher.multiprocessing.Process")
1077-
def test_stop_not_running_cleans_up_log_file(
1078-
self, mock_process_class, gpu_translator, tmp_log_dir
1079-
):
1080-
"""Test that stop() removes the log file when process is not running"""
1081-
instance = self._make_instance(gpu_translator, tmp_log_dir)
1082-
# Create a log file manually
1083-
open(instance._log_file_path, "wb").close()
1084-
assert os.path.exists(instance._log_file_path)
1085-
1086-
instance.stop()
1087-
assert not os.path.exists(instance._log_file_path)
1088-
10891085
def test_cleanup_missing_file_no_error(self, gpu_translator, tmp_log_dir):
10901086
"""Test that _cleanup_log_file does not raise if file doesn't exist"""
10911087
instance = self._make_instance(gpu_translator, tmp_log_dir)

0 commit comments

Comments (0)