Skip to content

Commit 716648c

Browse files
committed
More comprehensive update to instance status model.
Also: update the tests.
Signed-off-by: Mike Spreitzer <mspreitz@us.ibm.com>
1 parent 797d241 commit 716648c

File tree

3 files changed

+82
-45
lines changed

3 files changed

+82
-45
lines changed

docs/launcher.md

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,7 @@ Response:
152152
{
153153
"status": "started",
154154
"instance_id": "a1b2c3d4-e5f6-7890-abcd-ef1234567890",
155+
"options": "--model facebook/opt-125m --port 8000"
155156
}
156157
```
157158

@@ -244,6 +245,7 @@ Create a new vLLM instance with an auto-generated UUID.
244245
```json
245246
{
246247
"options": "--model MODEL_NAME --port PORT",
248+
"gpu_uuids": ["GPU-33", "GPU-86"],
247249
"env_vars": {
248250
"VAR_NAME": "value"
249251
}
@@ -253,6 +255,7 @@ Create a new vLLM instance with an auto-generated UUID.
253255
**Parameters:**
254256

255257
- `options` (required): Command-line options for vLLM
258+
- `gpu_uuids` (optional): List of GPU UUIDs
256259
- `env_vars` (optional): Dictionary of environment variables
257260

258261
**Response (201 Created):**
@@ -261,6 +264,11 @@ Create a new vLLM instance with an auto-generated UUID.
261264
{
262265
"status": "started",
263266
"instance_id": "uuid-string",
267+
"options": "--model MODEL_NAME --port PORT",
268+
"gpu_uuids": ["GPU-33", "GPU-86"],
269+
"env_vars": {
270+
"VAR_NAME": "value"
271+
}
264272
}
265273
```
266274

@@ -307,6 +315,11 @@ Stop and delete a specific vLLM instance.
307315
{
308316
"status": "terminated",
309317
"instance_id": "instance-id",
318+
"options": "--model MODEL_NAME --port PORT",
319+
"gpu_uuids": ["GPU-33", "GPU-86"],
320+
"env_vars": {
321+
"VAR_NAME": "value"
322+
}
310323
}
311324
```
312325

@@ -408,14 +421,23 @@ Get status information for all instances. `Detail` is `True` by default.
408421
{
409422
"status": "running",
410423
"instance_id": "id-1",
424+
"options": <options 1>,
425+
"gpu_uuids": <gpus 1>,
426+
"env_vars": <envars 1>
411427
},
412428
{
413429
"status": "stopped",
414430
"instance_id": "id-2",
431+
"options": <options 2>,
432+
"gpu_uuids": <gpus 2>,
433+
"env_vars": <envars 2>
415434
},
416435
{
417436
"status": "running",
418437
"instance_id": "id-3",
438+
"options": <options 3>,
439+
"gpu_uuids": <gpus 3>,
440+
"env_vars": <envars 3>
419441
}
420442
]
421443
}
@@ -444,6 +466,11 @@ Get status information for a specific instance.
444466
{
445467
"status": "running",
446468
"instance_id": "instance-id",
469+
"options": "--model MODEL_NAME --port PORT",
470+
"gpu_uuids": ["GPU-33", "GPU-86"],
471+
"env_vars": {
472+
"VAR_NAME": "value"
473+
}
447474
}
448475
```
449476

@@ -510,6 +537,7 @@ curl -X POST http://localhost:8001/v2/vllm/instances \
510537
-H "Content-Type: application/json" \
511538
-d '{
512539
"options": "--model meta-llama/Llama-2-7b-hf --port 8000 --tensor-parallel-size 2",
540+
"gpu_uuids": ["GPU-33", "GPU-86"],
513541
"env_vars": {
514542
"CUDA_VISIBLE_DEVICES": "0,1",
515543
"VLLM_ATTENTION_BACKEND": "FLASHINFER",
@@ -640,13 +668,15 @@ Pydantic model (data class) defining the configuration for a vLLM instance.
640668
**Attributes:**
641669

642670
- `options` (str): Command-line options passed to vLLM (e.g., `"--model meta-llama/Llama-2-7b --port 8000"`)
671+
- `gpu_uuids` (Optional[List[str]]): UUIDs of GPUs
643672
- `env_vars` (Optional[Dict[str, Any]]): Environment variables to set for the vLLM process
644673

645674
Ex:
646675

647676
```yaml
648677
{
649678
"options": "--model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --port 8005",
679+
"gpu_uuids": ["GPU-33", "GPU-86"],
650680
"env_vars": {
651681
"VLLM_USE_V1": "1",
652682
"VLLM_LOGGING_LEVEL": "DEBUG"

inference_server/launcher/launcher.py

Lines changed: 27 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,14 @@ class VllmConfig(BaseModel):
6161
env_vars: Optional[Dict[str, str]] = None
6262

6363

64+
class HalfMade(Exception):
65+
"""Raised when something other than start is the first op on a VllmInstance"""
66+
67+
def __init__(self, instance_id):
68+
super().__init__()
69+
self.instance_id = instance_id
70+
71+
6472
class VllmInstance:
6573
"""Represents a single vLLM instance"""
6674

@@ -106,13 +114,20 @@ def __init__(
106114
f"launcher-{os.getpid()}-vllm-{instance_id}.log",
107115
)
108116

117+
def _make_state(self, status: str) -> dict:
118+
return {
119+
"status": status,
120+
"instance_id": self.instance_id,
121+
**self.config.model_dump(exclude_none=True),
122+
}
123+
109124
def start(self) -> dict:
110125
"""
111126
Start this vLLM instance
112127
:return: Status of the process.
113128
"""
114129
if self.process and self.process.is_alive():
115-
return {"status": "already_running", "instance_id": self.instance_id}
130+
return self._make_state("already_running")
116131

117132
# Create empty log file before spawning the child process
118133
open(self._log_file_path, "wb").close()
@@ -122,18 +137,17 @@ def start(self) -> dict:
122137
)
123138
self.process.start()
124139

125-
return {
126-
"status": "started",
127-
"instance_id": self.instance_id,
128-
}
140+
return self._make_state("started")
129141

130142
def stop(self, timeout: int = 10) -> dict:
131143
"""
132144
Stop existing vLLM instance
133145
:param timeout: waits for the process to stop, defaults to 10
134146
:return: a dictionary with the status "terminated"
135147
"""
136-
if not self.process or not self.process.is_alive():
148+
if self.process is None:
149+
raise HalfMade(self.instance_id)
150+
if not self.process.is_alive():
137151
self._cleanup_log_file()
138152
return {
139153
"status": "not_running",
@@ -156,10 +170,7 @@ def stop(self, timeout: int = 10) -> dict:
156170
self.process.join()
157171

158172
self._cleanup_log_file()
159-
return {
160-
"status": "terminated",
161-
"instance_id": self.instance_id,
162-
}
173+
return self._make_state("terminated")
163174

164175
def _cleanup_log_file(self):
165176
"""Remove the log file if it exists."""
@@ -173,12 +184,9 @@ def get_status(self) -> dict:
173184
Returns the status of the process
174185
:return: Status of the running process.
175186
"""
176-
177-
return {
178-
"status": "running" if self.process.is_alive() else "stopped",
179-
"instance_id": self.instance_id,
180-
**self.config.model_dump(),
181-
}
187+
if self.process is None:
188+
raise HalfMade(self.instance_id)
189+
return self._make_state("running" if self.process.is_alive() else "stopped")
182190

183191
def get_log_bytes(
184192
self, start: int = 0, end: int | None = None
@@ -246,6 +254,9 @@ def create_instance(
246254
)
247255
self.instances[instance_id] = instance
248256

257+
# Because start() is always the first method called on a VllmInstance,
258+
# the other methods of VllmMultiProcessManager do not need to handle
259+
# the HalfMade exception.
249260
return instance.start()
250261

251262
def stop_instance(self, instance_id: str, timeout: int = 10) -> dict:

inference_server/launcher/tests/test_launcher.py

Lines changed: 25 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
# Import the application and classes
3838
from launcher import ( # noqa: E402
3939
MAX_LOG_RESPONSE_BYTES,
40+
HalfMade,
4041
LogRangeNotAvailable,
4142
VllmConfig,
4243
VllmInstance,
@@ -147,7 +148,7 @@ def test_instance_creation(self, vllm_config, gpu_translator, tmp_log_dir):
147148

148149
@patch("launcher.multiprocessing.Process")
149150
def test_instance_start(
150-
self, mock_process_class, vllm_config, gpu_translator, tmp_log_dir
151+
self, mock_process_class, vllm_config: VllmConfig, gpu_translator, tmp_log_dir
151152
):
152153
"""Test starting a vLLM instance"""
153154
mock_process = MockProcess()
@@ -160,11 +161,13 @@ def test_instance_start(
160161

161162
assert result["status"] == "started"
162163
assert result["instance_id"] == "test-id"
164+
for key, val in vllm_config.model_dump(exclude_none=True).items():
165+
assert result[key] == val
163166
assert os.path.exists(instance._log_file_path)
164167

165168
@patch("launcher.multiprocessing.Process")
166169
def test_instance_start_already_running(
167-
self, mock_process_class, vllm_config, gpu_translator, tmp_log_dir
170+
self, mock_process_class, vllm_config: VllmConfig, gpu_translator, tmp_log_dir
168171
):
169172
"""Test starting an instance that's already running"""
170173
mock_process = MockProcess()
@@ -178,10 +181,12 @@ def test_instance_start_already_running(
178181

179182
assert result["status"] == "already_running"
180183
assert result["instance_id"] == "test-id"
184+
for key, val in vllm_config.model_dump(exclude_none=True).items():
185+
assert result[key] == val
181186

182187
@patch("launcher.multiprocessing.Process")
183188
def test_instance_stop(
184-
self, mock_process_class, vllm_config, gpu_translator, tmp_log_dir
189+
self, mock_process_class, vllm_config: VllmConfig, gpu_translator, tmp_log_dir
185190
):
186191
"""Test stopping a running instance"""
187192
mock_process = MockProcess()
@@ -195,6 +200,8 @@ def test_instance_stop(
195200

196201
assert result["status"] == "terminated"
197202
assert result["instance_id"] == "test-id"
203+
for key, val in vllm_config.model_dump(exclude_none=True).items():
204+
assert result[key] == val
198205
assert mock_process.terminated is True
199206

200207
@patch("launcher.multiprocessing.Process")
@@ -203,10 +210,11 @@ def test_instance_stop_not_running(self, vllm_config, gpu_translator, tmp_log_di
203210
instance = VllmInstance(
204211
"test-id", vllm_config, gpu_translator, log_dir=tmp_log_dir
205212
)
206-
result = instance.stop()
207-
208-
assert result["status"] == "not_running"
209-
assert result["instance_id"] == "test-id"
213+
try:
214+
_ = instance.stop()
215+
assert False
216+
except HalfMade:
217+
assert True
210218

211219
@patch("launcher.os.killpg")
212220
@patch("launcher.multiprocessing.Process")
@@ -244,7 +252,7 @@ def join_side_effect(timeout=None):
244252

245253
@patch("launcher.multiprocessing.Process")
246254
def test_instance_get_status(
247-
self, mock_process_class, vllm_config, gpu_translator, tmp_log_dir
255+
self, mock_process_class, vllm_config: VllmConfig, gpu_translator, tmp_log_dir
248256
):
249257
"""Test getting instance status"""
250258
mock_process = MockProcess()
@@ -258,14 +266,15 @@ def test_instance_get_status(
258266
instance.start()
259267
status = instance.get_status()
260268
assert status["status"] == "running"
261-
assert status["options"] == vllm_config.options
262-
assert status["env_vars"] == vllm_config.env_vars
269+
for key, val in vllm_config.model_dump(exclude_none=True).items():
270+
assert status[key] == val
263271

264272
# Stopped
265273
mock_process._is_alive = False
266274
status = instance.get_status()
267275
assert status["status"] == "stopped"
268-
assert status["options"] == vllm_config.options
276+
for key, val in vllm_config.model_dump(exclude_none=True).items():
277+
assert status[key] == val
269278

270279
@patch("launcher.multiprocessing.Process")
271280
def test_instance_uuid_to_index_translation(
@@ -466,7 +475,7 @@ def test_stop_all_instances(self, mock_process_class, manager, vllm_config):
466475
assert len(manager.instances) == 0
467476

468477
@patch("launcher.multiprocessing.Process")
469-
def test_get_instance_status(self, mock_process_class, manager, vllm_config):
478+
def test_get_instance_status(self, mock_process_class, manager, vllm_config: VllmConfig):
470479
"""Test getting status of specific instance"""
471480
mock_process = MockProcess()
472481
mock_process_class.return_value = mock_process
@@ -476,8 +485,8 @@ def test_get_instance_status(self, mock_process_class, manager, vllm_config):
476485

477486
assert status["status"] == "running"
478487
assert status["instance_id"] == "test-id"
479-
assert status["options"] == vllm_config.options
480-
assert status["env_vars"] == vllm_config.env_vars
488+
for key, val in vllm_config.model_dump(exclude_none=True).items():
489+
assert status[key] == val
481490

482491
@patch("launcher.multiprocessing.Process")
483492
def test_get_instance_status_nonexistent(self, mock_process_class, manager):
@@ -500,8 +509,8 @@ def test_get_all_instances_status(self, mock_process_class, manager, vllm_config
500509
assert status["running_instances"] == 2
501510
assert len(status["instances"]) == 2
502511
for inst in status["instances"]:
503-
assert inst["options"] == vllm_config.options
504-
assert inst["env_vars"] == vllm_config.env_vars
512+
for key, val in vllm_config.model_dump(exclude_none=True).items():
513+
assert inst[key] == val
505514

506515
@patch("launcher.multiprocessing.Process")
507516
def test_list_instances(self, mock_process_class, manager, vllm_config):
@@ -1073,19 +1082,6 @@ def test_stop_terminated_cleans_up_log_file(
10731082
instance.stop()
10741083
assert not os.path.exists(instance._log_file_path)
10751084

1076-
@patch("launcher.multiprocessing.Process")
1077-
def test_stop_not_running_cleans_up_log_file(
1078-
self, mock_process_class, gpu_translator, tmp_log_dir
1079-
):
1080-
"""Test that stop() removes the log file when process is not running"""
1081-
instance = self._make_instance(gpu_translator, tmp_log_dir)
1082-
# Create a log file manually
1083-
open(instance._log_file_path, "wb").close()
1084-
assert os.path.exists(instance._log_file_path)
1085-
1086-
instance.stop()
1087-
assert not os.path.exists(instance._log_file_path)
1088-
10891085
def test_cleanup_missing_file_no_error(self, gpu_translator, tmp_log_dir):
10901086
"""Test that _cleanup_log_file does not raise if file doesn't exist"""
10911087
instance = self._make_instance(gpu_translator, tmp_log_dir)

0 commit comments

Comments (0)