Skip to content

Commit 5a1a9f8

Browse files
committed
Check the readiness of inference-server container
Signed-off-by: Jun Duan <jun.duan.phd@outlook.com>
1 parent 3f4b6cb commit 5a1a9f8

File tree

1 file changed

+32
-2
lines changed

1 file changed

+32
-2
lines changed

inference_server/launcher/launcher_pod_notifier.py

Lines changed: 32 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
DEFAULT_BASE_URL = "http://127.0.0.1:8001"
3434
DEFAULT_POLL_INTERVAL_SECONDS = 2.0
3535
DEFAULT_ERROR_BACKOFF_SECONDS = 5.0
36+
INFERENCE_SERVER_CONTAINER_NAME = "inference-server"
3637

3738

3839
logger = logging.getLogger("launcher_pod_notifier")
@@ -52,7 +53,33 @@ def get_required_env(name: str) -> str:
5253
return value
5354

5455

55-
def fetch_launcher_state(base_url: str) -> dict[str, Any]:
56+
def is_inference_server_ready(
    api: client.CoreV1Api, namespace: str, pod_name: str
) -> bool:
    """Report whether the pod's inference-server container is ready.

    Reads the pod ``pod_name`` in ``namespace`` through ``api`` and looks
    for the container status whose name equals
    ``INFERENCE_SERVER_CONTAINER_NAME``.

    Returns the container's ``ready`` value when that container is found
    and the value is truthy; otherwise ``False`` (no status on the pod, no
    container statuses, container not listed, or ``ready`` falsy).

    Never raises: any exception during the lookup is logged as a warning
    and treated as "not ready".
    """
    try:
        pod = api.read_namespaced_pod(name=pod_name, namespace=namespace)
        # A missing status or an empty/None status list both collapse to
        # "nothing to scan".
        statuses = (pod.status and pod.status.container_statuses) or ()
        target = next(
            (
                entry
                for entry in statuses
                if entry.name == INFERENCE_SERVER_CONTAINER_NAME
            ),
            None,
        )
        if target is None:
            # inference-server container not found
            return False
        return target.ready or False
    except Exception as exc:
        logger.warning("Failed to check inference-server readiness: %s", exc)
        return False
74+
75+
76+
def fetch_launcher_state(
77+
base_url: str, api: client.CoreV1Api, namespace: str, pod_name: str
78+
) -> dict[str, Any]:
79+
"""Fetch launcher state only if inference-server container is ready."""
80+
if not is_inference_server_ready(api, namespace, pod_name):
81+
raise RuntimeError("inference-server container is not ready yet")
82+
5683
url = f"{base_url}/v2/vllm/instances"
5784
with urllib.request.urlopen(url, timeout=5) as response:
5885
payload = json.load(response)
@@ -142,14 +169,17 @@ def main() -> int:
142169

143170
while True:
144171
try:
145-
signature = compute_signature(fetch_launcher_state(base_url))
172+
signature = compute_signature(
173+
fetch_launcher_state(base_url, api, namespace, pod_name)
174+
)
146175
if signature != last_published_signature:
147176
patch_pod_signature(api, namespace, pod_name, signature)
148177
last_published_signature = signature
149178
time.sleep(poll_interval)
150179
except (
151180
ApiException,
152181
OSError,
182+
RuntimeError,
153183
TimeoutError,
154184
ValueError,
155185
urllib.error.HTTPError,

0 commit comments

Comments
 (0)