Skip to content

Commit 49b3991

Browse files
committed
More comprehensive update to instance status model
Also: update the tests.

Signed-off-by: Mike Spreitzer <mspreitz@us.ibm.com>
1 parent 797d241 commit 49b3991

File tree

3 files changed

+76
-24
lines changed

3 files changed

+76
-24
lines changed

docs/launcher.md

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,7 @@ Response:
152152
{
153153
"status": "started",
154154
"instance_id": "a1b2c3d4-e5f6-7890-abcd-ef1234567890",
155+
"options": "--model facebook/opt-125m --port 8000"
155156
}
156157
```
157158

@@ -244,6 +245,7 @@ Create a new vLLM instance with an auto-generated UUID.
244245
```json
245246
{
246247
"options": "--model MODEL_NAME --port PORT",
248+
"gpu_uuids": ["GPU-33", "GPU-86"],
247249
"env_vars": {
248250
"VAR_NAME": "value"
249251
}
@@ -253,6 +255,7 @@ Create a new vLLM instance with an auto-generated UUID.
253255
**Parameters:**
254256

255257
- `options` (required): Command-line options for vLLM
258+
- `gpu_uuids` (optional): List of GPU UUIDs
256259
- `env_vars` (optional): Dictionary of environment variables
257260

258261
**Response (201 Created):**
@@ -261,6 +264,11 @@ Create a new vLLM instance with an auto-generated UUID.
261264
{
262265
"status": "started",
263266
"instance_id": "uuid-string",
267+
"options": "--model MODEL_NAME --port PORT",
268+
"gpu_uuids": ["GPU-33", "GPU-86"],
269+
"env_vars": {
270+
"VAR_NAME": "value"
271+
}
264272
}
265273
```
266274

@@ -307,6 +315,11 @@ Stop and delete a specific vLLM instance.
307315
{
308316
"status": "terminated",
309317
"instance_id": "instance-id",
318+
"options": "--model MODEL_NAME --port PORT",
319+
"gpu_uuids": ["GPU-33", "GPU-86"],
320+
"env_vars": {
321+
"VAR_NAME": "value"
322+
}
310323
}
311324
```
312325

@@ -408,14 +421,23 @@ Get status information for all instances. `Detail` is `True` by default.
408421
{
409422
"status": "running",
410423
"instance_id": "id-1",
424+
"options": <options 1>,
425+
"gpu_uuids": <gpus 1>,
426+
"env_vars": <envars 1>
411427
},
412428
{
413429
"status": "stopped",
414430
"instance_id": "id-2",
431+
"options": <options 2>,
432+
"gpu_uuids": <gpus 2>,
433+
"env_vars": <envars 2>
415434
},
416435
{
417436
"status": "running",
418437
"instance_id": "id-3",
438+
"options": <options 3>,
439+
"gpu_uuids": <gpus 3>,
440+
"env_vars": <envars 3>
419441
}
420442
]
421443
}
@@ -444,6 +466,11 @@ Get status information for a specific instance.
444466
{
445467
"status": "running",
446468
"instance_id": "instance-id",
469+
"options": "--model MODEL_NAME --port PORT",
470+
"gpu_uuids": ["GPU-33", "GPU-86"],
471+
"env_vars": {
472+
"VAR_NAME": "value"
473+
}
447474
}
448475
```
449476

@@ -510,6 +537,7 @@ curl -X POST http://localhost:8001/v2/vllm/instances \
510537
-H "Content-Type: application/json" \
511538
-d '{
512539
"options": "--model meta-llama/Llama-2-7b-hf --port 8000 --tensor-parallel-size 2",
540+
"gpu_uuids": ["GPU-33", "GPU-86"],
513541
"env_vars": {
514542
"CUDA_VISIBLE_DEVICES": "0,1",
515543
"VLLM_ATTENTION_BACKEND": "FLASHINFER",
@@ -640,13 +668,15 @@ Pydantic model (data class) defining the configuration for a vLLM instance.
640668
**Attributes:**
641669

642670
- `options` (str): Command-line options passed to vLLM (e.g., `"--model meta-llama/Llama-2-7b --port 8000"`)
671+
- `gpu_uuids` (Optional[List[str]]): UUIDs of GPUs
643672
- `env_vars` (Optional[Dict[str, Any]]): Environment variables to set for the vLLM process
644673

645674
Ex:
646675

647676
```yaml
648677
{
649678
"options": "--model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --port 8005",
679+
"gpu_uuids": ["GPU-33", "GPU-86"],
650680
"env_vars": {
651681
"VLLM_USE_V1": "1",
652682
"VLLM_LOGGING_LEVEL": "DEBUG"

inference_server/launcher/launcher.py

Lines changed: 26 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,13 @@ class VllmConfig(BaseModel):
6161
env_vars: Optional[Dict[str, str]] = None
6262

6363

64+
class HalfStarted(Exception):
65+
"""Raised when something other than start is the first op on a VllmInstance"""
66+
67+
def __init__(self):
68+
super().__init__()
69+
70+
6471
class VllmInstance:
6572
"""Represents a single vLLM instance"""
6673

@@ -106,13 +113,20 @@ def __init__(
106113
f"launcher-{os.getpid()}-vllm-{instance_id}.log",
107114
)
108115

116+
def _make_state(self, status: str) -> dict:
117+
return {
118+
"status": status,
119+
"instance_id": self.instance_id,
120+
**self.config.model_dump(exclude_none=True),
121+
}
122+
109123
def start(self) -> dict:
110124
"""
111125
Start this vLLM instance
112126
:return: Status of the process.
113127
"""
114128
if self.process and self.process.is_alive():
115-
return {"status": "already_running", "instance_id": self.instance_id}
129+
return self._make_state("already_running")
116130

117131
# Create empty log file before spawning the child process
118132
open(self._log_file_path, "wb").close()
@@ -122,18 +136,17 @@ def start(self) -> dict:
122136
)
123137
self.process.start()
124138

125-
return {
126-
"status": "started",
127-
"instance_id": self.instance_id,
128-
}
139+
return self._make_state("started")
129140

130141
def stop(self, timeout: int = 10) -> dict:
131142
"""
132143
Stop existing vLLM instance
133144
:param timeout: waits for the process to stop, defaults to 10
134145
:return: a dictionary with the status "terminated"
135146
"""
136-
if not self.process or not self.process.is_alive():
147+
if self.process is None:
148+
raise HalfStarted()
149+
if not self.process.is_alive():
137150
self._cleanup_log_file()
138151
return {
139152
"status": "not_running",
@@ -156,10 +169,7 @@ def stop(self, timeout: int = 10) -> dict:
156169
self.process.join()
157170

158171
self._cleanup_log_file()
159-
return {
160-
"status": "terminated",
161-
"instance_id": self.instance_id,
162-
}
172+
return self._make_state("terminated")
163173

164174
def _cleanup_log_file(self):
165175
"""Remove the log file if it exists."""
@@ -173,12 +183,9 @@ def get_status(self) -> dict:
173183
Returns the status of the process
174184
:return: Status of the running process.
175185
"""
176-
177-
return {
178-
"status": "running" if self.process.is_alive() else "stopped",
179-
"instance_id": self.instance_id,
180-
**self.config.model_dump(),
181-
}
186+
if self.process is None:
187+
raise HalfStarted()
188+
return self._make_state("running" if self.process.is_alive() else "stopped")
182189

183190
def get_log_bytes(
184191
self, start: int = 0, end: int | None = None
@@ -246,6 +253,9 @@ def create_instance(
246253
)
247254
self.instances[instance_id] = instance
248255

256+
"""Because start() is always the first method called on a VllmInstance,
257+
the other methods of VllmMultiProcessManager do not need to handle
258+
the HalfStarted exception."""
249259
return instance.start()
250260

251261
def stop_instance(self, instance_id: str, timeout: int = 10) -> dict:

inference_server/launcher/tests/test_launcher.py

Lines changed: 20 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@
4040
LogRangeNotAvailable,
4141
VllmConfig,
4242
VllmInstance,
43+
HalfStarted,
4344
VllmMultiProcessManager,
4445
app,
4546
parse_range_header,
@@ -147,7 +148,7 @@ def test_instance_creation(self, vllm_config, gpu_translator, tmp_log_dir):
147148

148149
@patch("launcher.multiprocessing.Process")
149150
def test_instance_start(
150-
self, mock_process_class, vllm_config, gpu_translator, tmp_log_dir
151+
self, mock_process_class, vllm_config: VllmConfig, gpu_translator, tmp_log_dir
151152
):
152153
"""Test starting a vLLM instance"""
153154
mock_process = MockProcess()
@@ -160,11 +161,13 @@ def test_instance_start(
160161

161162
assert result["status"] == "started"
162163
assert result["instance_id"] == "test-id"
164+
for key, val in vllm_config.model_dump(exclude_none=True).items():
165+
assert result[key] == val
163166
assert os.path.exists(instance._log_file_path)
164167

165168
@patch("launcher.multiprocessing.Process")
166169
def test_instance_start_already_running(
167-
self, mock_process_class, vllm_config, gpu_translator, tmp_log_dir
170+
self, mock_process_class, vllm_config: VllmConfig, gpu_translator, tmp_log_dir
168171
):
169172
"""Test starting an instance that's already running"""
170173
mock_process = MockProcess()
@@ -178,10 +181,12 @@ def test_instance_start_already_running(
178181

179182
assert result["status"] == "already_running"
180183
assert result["instance_id"] == "test-id"
184+
for key, val in vllm_config.model_dump(exclude_none=True).items():
185+
assert result[key] == val
181186

182187
@patch("launcher.multiprocessing.Process")
183188
def test_instance_stop(
184-
self, mock_process_class, vllm_config, gpu_translator, tmp_log_dir
189+
self, mock_process_class, vllm_config: VllmConfig, gpu_translator, tmp_log_dir
185190
):
186191
"""Test stopping a running instance"""
187192
mock_process = MockProcess()
@@ -195,6 +200,8 @@ def test_instance_stop(
195200

196201
assert result["status"] == "terminated"
197202
assert result["instance_id"] == "test-id"
203+
for key, val in vllm_config.model_dump(exclude_none=True).items():
204+
assert result[key] == val
198205
assert mock_process.terminated is True
199206

200207
@patch("launcher.multiprocessing.Process")
@@ -203,10 +210,11 @@ def test_instance_stop_not_running(self, vllm_config, gpu_translator, tmp_log_di
203210
instance = VllmInstance(
204211
"test-id", vllm_config, gpu_translator, log_dir=tmp_log_dir
205212
)
206-
result = instance.stop()
207-
208-
assert result["status"] == "not_running"
209-
assert result["instance_id"] == "test-id"
213+
try:
214+
result = instance.stop()
215+
assert False
216+
except HalfStarted:
217+
assert True
210218

211219
@patch("launcher.os.killpg")
212220
@patch("launcher.multiprocessing.Process")
@@ -244,7 +252,7 @@ def join_side_effect(timeout=None):
244252

245253
@patch("launcher.multiprocessing.Process")
246254
def test_instance_get_status(
247-
self, mock_process_class, vllm_config, gpu_translator, tmp_log_dir
255+
self, mock_process_class, vllm_config: VllmConfig, gpu_translator, tmp_log_dir
248256
):
249257
"""Test getting instance status"""
250258
mock_process = MockProcess()
@@ -260,12 +268,16 @@ def test_instance_get_status(
260268
assert status["status"] == "running"
261269
assert status["options"] == vllm_config.options
262270
assert status["env_vars"] == vllm_config.env_vars
271+
for key, val in vllm_config.model_dump(exclude_none=True).items():
272+
assert status[key] == val
263273

264274
# Stopped
265275
mock_process._is_alive = False
266276
status = instance.get_status()
267277
assert status["status"] == "stopped"
268278
assert status["options"] == vllm_config.options
279+
for key, val in vllm_config.model_dump(exclude_none=True).items():
280+
assert status[key] == val
269281

270282
@patch("launcher.multiprocessing.Process")
271283
def test_instance_uuid_to_index_translation(

0 commit comments

Comments (0)