Skip to content

Commit 49b3991

Browse files
committed
More comprehensive update to instance status model
Also: update the tests.

Signed-off-by: Mike Spreitzer <mspreitz@us.ibm.com>
1 parent 797d241 commit 49b3991

File tree

3 files changed

+76
-24
lines changed

3 files changed

+76
-24
lines changed

docs/launcher.md

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,7 @@ Response:
152152
{
153153
"status": "started",
154154
"instance_id": "a1b2c3d4-e5f6-7890-abcd-ef1234567890",
155+
"options": "--model facebook/opt-125m --port 8000"
155156
}
156157
```
157158

@@ -244,6 +245,7 @@ Create a new vLLM instance with an auto-generated UUID.
244245
```json
245246
{
246247
"options": "--model MODEL_NAME --port PORT",
248+
"gpu_uuids": ["GPU-33", "GPU-86"],
247249
"env_vars": {
248250
"VAR_NAME": "value"
249251
}
@@ -253,6 +255,7 @@ Create a new vLLM instance with an auto-generated UUID.
253255
**Parameters:**
254256

255257
- `options` (required): Command-line options for vLLM
258+
- `gpu_uuids` (optional): List of GPU UUIDs
256259
- `env_vars` (optional): Dictionary of environment variables
257260

258261
**Response (201 Created):**
@@ -261,6 +264,11 @@ Create a new vLLM instance with an auto-generated UUID.
261264
{
262265
"status": "started",
263266
"instance_id": "uuid-string",
267+
"options": "--model MODEL_NAME --port PORT",
268+
"gpu_uuids": ["GPU-33", "GPU-86"],
269+
"env_vars": {
270+
"VAR_NAME": "value"
271+
}
264272
}
265273
```
266274

@@ -307,6 +315,11 @@ Stop and delete a specific vLLM instance.
307315
{
308316
"status": "terminated",
309317
"instance_id": "instance-id",
318+
"options": "--model MODEL_NAME --port PORT",
319+
"gpu_uuids": ["GPU-33", "GPU-86"],
320+
"env_vars": {
321+
"VAR_NAME": "value"
322+
}
310323
}
311324
```
312325

@@ -408,14 +421,23 @@ Get status information for all instances. `Detail` is `True` by default.
408421
{
409422
"status": "running",
410423
"instance_id": "id-1",
424+
"options": <options 1>,
425+
"gpu_uuids": <gpus 1>,
426+
"env_vars": <envars 1>
411427
},
412428
{
413429
"status": "stopped",
414430
"instance_id": "id-2",
431+
"options": <options 2>,
432+
"gpu_uuids": <gpus 2>,
433+
"env_vars": <envars 2>
415434
},
416435
{
417436
"status": "running",
418437
"instance_id": "id-3",
438+
"options": <options 3>,
439+
"gpu_uuids": <gpus 3>,
440+
"env_vars": <envars 3>
419441
}
420442
]
421443
}
@@ -444,6 +466,11 @@ Get status information for a specific instance.
444466
{
445467
"status": "running",
446468
"instance_id": "instance-id",
469+
"options": "--model MODEL_NAME --port PORT",
470+
"gpu_uuids": ["GPU-33", "GPU-86"],
471+
"env_vars": {
472+
"VAR_NAME": "value"
473+
}
447474
}
448475
```
449476

@@ -510,6 +537,7 @@ curl -X POST http://localhost:8001/v2/vllm/instances \
510537
-H "Content-Type: application/json" \
511538
-d '{
512539
"options": "--model meta-llama/Llama-2-7b-hf --port 8000 --tensor-parallel-size 2",
540+
"gpu_uuids": ["GPU-33", "GPU-86"],
513541
"env_vars": {
514542
"CUDA_VISIBLE_DEVICES": "0,1",
515543
"VLLM_ATTENTION_BACKEND": "FLASHINFER",
@@ -640,13 +668,15 @@ Pydantic model (data class) defining the configuration for a vLLM instance.
640668
**Attributes:**
641669

642670
- `options` (str): Command-line options passed to vLLM (e.g., `"--model meta-llama/Llama-2-7b --port 8000"`)
671+
- `gpu_uuids` (Optional[List[str]]): UUIDs of GPUs
643672
- `env_vars` (Optional[Dict[str, Any]]): Environment variables to set for the vLLM process
644673

645674
Ex:
646675

647676
```yaml
648677
{
649678
"options": "--model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --port 8005",
679+
"gpu_uuids": ["GPU-33", "GPU-86"],
650680
"env_vars": {
651681
"VLLM_USE_V1": "1",
652682
"VLLM_LOGGING_LEVEL": "DEBUG"

inference_server/launcher/launcher.py

Lines changed: 26 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,13 @@ class VllmConfig(BaseModel):
6161
env_vars: Optional[Dict[str, str]] = None
6262

6363

64+
class HalfStarted(Exception):
65+
"""Raised when something other than start is the first op on a VllmInstance"""
66+
67+
def __init__(self):
68+
super().__init__()
69+
70+
6471
class VllmInstance:
6572
"""Represents a single vLLM instance"""
6673

@@ -106,13 +113,20 @@ def __init__(
106113
f"launcher-{os.getpid()}-vllm-{instance_id}.log",
107114
)
108115

116+
def _make_state(self, status: str) -> dict:
117+
return {
118+
"status": status,
119+
"instance_id": self.instance_id,
120+
**self.config.model_dump(exclude_none=True),
121+
}
122+
109123
def start(self) -> dict:
110124
"""
111125
Start this vLLM instance
112126
:return: Status of the process.
113127
"""
114128
if self.process and self.process.is_alive():
115-
return {"status": "already_running", "instance_id": self.instance_id}
129+
return self._make_state("already_running")
116130

117131
# Create empty log file before spawning the child process
118132
open(self._log_file_path, "wb").close()
@@ -122,18 +136,17 @@ def start(self) -> dict:
122136
)
123137
self.process.start()
124138

125-
return {
126-
"status": "started",
127-
"instance_id": self.instance_id,
128-
}
139+
return self._make_state("started")
129140

130141
def stop(self, timeout: int = 10) -> dict:
131142
"""
132143
Stop existing vLLM instance
133144
:param timeout: waits for the process to stop, defaults to 10
134145
:return: a dictionary with the status "terminated"
135146
"""
136-
if not self.process or not self.process.is_alive():
147+
if self.process is None:
148+
raise HalfStarted()
149+
if not self.process.is_alive():
137150
self._cleanup_log_file()
138151
return {
139152
"status": "not_running",
@@ -156,10 +169,7 @@ def stop(self, timeout: int = 10) -> dict:
156169
self.process.join()
157170

158171
self._cleanup_log_file()
159-
return {
160-
"status": "terminated",
161-
"instance_id": self.instance_id,
162-
}
172+
return self._make_state("terminated")
163173

164174
def _cleanup_log_file(self):
165175
"""Remove the log file if it exists."""
@@ -173,12 +183,9 @@ def get_status(self) -> dict:
173183
Returns the status of the process
174184
:return: Status of the running process.
175185
"""
176-
177-
return {
178-
"status": "running" if self.process.is_alive() else "stopped",
179-
"instance_id": self.instance_id,
180-
**self.config.model_dump(),
181-
}
186+
if self.process is None:
187+
raise HalfStarted()
188+
return self._make_state("running" if self.process.is_alive() else "stopped")
182189

183190
def get_log_bytes(
184191
self, start: int = 0, end: int | None = None
@@ -246,6 +253,9 @@ def create_instance(
246253
)
247254
self.instances[instance_id] = instance
248255

256+
"""Because start() is always the first method called on a VllmInstance,
257+
the other methods of VllmMultiProcessManager do not need to handle
258+
the HalfStarted exception."""
249259
return instance.start()
250260

251261
def stop_instance(self, instance_id: str, timeout: int = 10) -> dict:

inference_server/launcher/tests/test_launcher.py

Lines changed: 20 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@
4040
LogRangeNotAvailable,
4141
VllmConfig,
4242
VllmInstance,
43+
HalfStarted,
4344
VllmMultiProcessManager,
4445
app,
4546
parse_range_header,
@@ -147,7 +148,7 @@ def test_instance_creation(self, vllm_config, gpu_translator, tmp_log_dir):
147148

148149
@patch("launcher.multiprocessing.Process")
149150
def test_instance_start(
150-
self, mock_process_class, vllm_config, gpu_translator, tmp_log_dir
151+
self, mock_process_class, vllm_config: VllmConfig, gpu_translator, tmp_log_dir
151152
):
152153
"""Test starting a vLLM instance"""
153154
mock_process = MockProcess()
@@ -160,11 +161,13 @@ def test_instance_start(
160161

161162
assert result["status"] == "started"
162163
assert result["instance_id"] == "test-id"
164+
for key, val in vllm_config.model_dump(exclude_none=True).items():
165+
assert result[key] == val
163166
assert os.path.exists(instance._log_file_path)
164167

165168
@patch("launcher.multiprocessing.Process")
166169
def test_instance_start_already_running(
167-
self, mock_process_class, vllm_config, gpu_translator, tmp_log_dir
170+
self, mock_process_class, vllm_config: VllmConfig, gpu_translator, tmp_log_dir
168171
):
169172
"""Test starting an instance that's already running"""
170173
mock_process = MockProcess()
@@ -178,10 +181,12 @@ def test_instance_start_already_running(
178181

179182
assert result["status"] == "already_running"
180183
assert result["instance_id"] == "test-id"
184+
for key, val in vllm_config.model_dump(exclude_none=True).items():
185+
assert result[key] == val
181186

182187
@patch("launcher.multiprocessing.Process")
183188
def test_instance_stop(
184-
self, mock_process_class, vllm_config, gpu_translator, tmp_log_dir
189+
self, mock_process_class, vllm_config: VllmConfig, gpu_translator, tmp_log_dir
185190
):
186191
"""Test stopping a running instance"""
187192
mock_process = MockProcess()
@@ -195,6 +200,8 @@ def test_instance_stop(
195200

196201
assert result["status"] == "terminated"
197202
assert result["instance_id"] == "test-id"
203+
for key, val in vllm_config.model_dump(exclude_none=True).items():
204+
assert result[key] == val
198205
assert mock_process.terminated is True
199206

200207
@patch("launcher.multiprocessing.Process")
@@ -203,10 +210,11 @@ def test_instance_stop_not_running(self, vllm_config, gpu_translator, tmp_log_di
203210
instance = VllmInstance(
204211
"test-id", vllm_config, gpu_translator, log_dir=tmp_log_dir
205212
)
206-
result = instance.stop()
207-
208-
assert result["status"] == "not_running"
209-
assert result["instance_id"] == "test-id"
213+
try:
214+
result = instance.stop()
215+
assert False
216+
except HalfStarted:
217+
assert True
210218

211219
@patch("launcher.os.killpg")
212220
@patch("launcher.multiprocessing.Process")
@@ -244,7 +252,7 @@ def join_side_effect(timeout=None):
244252

245253
@patch("launcher.multiprocessing.Process")
246254
def test_instance_get_status(
247-
self, mock_process_class, vllm_config, gpu_translator, tmp_log_dir
255+
self, mock_process_class, vllm_config: VllmConfig, gpu_translator, tmp_log_dir
248256
):
249257
"""Test getting instance status"""
250258
mock_process = MockProcess()
@@ -260,12 +268,16 @@ def test_instance_get_status(
260268
assert status["status"] == "running"
261269
assert status["options"] == vllm_config.options
262270
assert status["env_vars"] == vllm_config.env_vars
271+
for key, val in vllm_config.model_dump(exclude_none=True).items():
272+
assert status[key] == val
263273

264274
# Stopped
265275
mock_process._is_alive = False
266276
status = instance.get_status()
267277
assert status["status"] == "stopped"
268278
assert status["options"] == vllm_config.options
279+
for key, val in vllm_config.model_dump(exclude_none=True).items():
280+
assert status[key] == val
269281

270282
@patch("launcher.multiprocessing.Process")
271283
def test_instance_uuid_to_index_translation(

0 commit comments

Comments (0)