3333DEFAULT_BASE_URL = "http://127.0.0.1:8001"
3434DEFAULT_POLL_INTERVAL_SECONDS = 2.0
3535DEFAULT_ERROR_BACKOFF_SECONDS = 5.0
36+ INFERENCE_SERVER_CONTAINER_NAME = "inference-server"
3637
3738
3839logger = logging .getLogger ("launcher_pod_notifier" )
@@ -52,7 +53,33 @@ def get_required_env(name: str) -> str:
5253 return value
5354
5455
55- def fetch_launcher_state (base_url : str ) -> dict [str , Any ]:
def is_inference_server_ready(
    api: client.CoreV1Api, namespace: str, pod_name: str
) -> bool:
    """Report whether the pod's inference-server container is marked ready.

    Reads the pod through ``api.read_namespaced_pod`` and looks for the
    container status whose name equals ``INFERENCE_SERVER_CONTAINER_NAME``.

    Args:
        api: Client used to read the pod.
        namespace: Namespace passed to the pod read call.
        pod_name: Name of the pod to read.

    Returns:
        The matching container status's ``ready`` value when it is truthy.
        ``False`` when the pod has no status, has no container statuses,
        has no container with the expected name, when ``ready`` is falsy,
        or when any exception is raised while checking.
    """
    try:
        pod_status = api.read_namespaced_pod(
            name=pod_name, namespace=namespace
        ).status
        # A missing status or a missing/empty status list collapses to an
        # empty sequence, so the lookup below simply finds nothing.
        statuses = (pod_status and pod_status.container_statuses) or ()
        target = next(
            (
                entry
                for entry in statuses
                if entry.name == INFERENCE_SERVER_CONTAINER_NAME
            ),
            None,
        )
        if target is None:
            # inference-server container not found
            return False
        # `ready` may be None; report that as not ready.
        return target.ready or False
    except Exception as exc:
        # Best effort: any failure is logged and treated as "not ready"
        # instead of being propagated to the caller.
        logger.warning("Failed to check inference-server readiness: %s", exc)
        return False
74+
75+
76+ def fetch_launcher_state (
77+ base_url : str , api : client .CoreV1Api , namespace : str , pod_name : str
78+ ) -> dict [str , Any ]:
79+ """Fetch launcher state only if inference-server container is ready."""
80+ if not is_inference_server_ready (api , namespace , pod_name ):
81+ raise RuntimeError ("inference-server container is not ready yet" )
82+
5683 url = f"{ base_url } /v2/vllm/instances"
5784 with urllib .request .urlopen (url , timeout = 5 ) as response :
5885 payload = json .load (response )
@@ -142,14 +169,17 @@ def main() -> int:
142169
143170 while True :
144171 try :
145- signature = compute_signature (fetch_launcher_state (base_url ))
172+ signature = compute_signature (
173+ fetch_launcher_state (base_url , api , namespace , pod_name )
174+ )
146175 if signature != last_published_signature :
147176 patch_pod_signature (api , namespace , pod_name , signature )
148177 last_published_signature = signature
149178 time .sleep (poll_interval )
150179 except (
151180 ApiException ,
152181 OSError ,
182+ RuntimeError ,
153183 TimeoutError ,
154184 ValueError ,
155185 urllib .error .HTTPError ,
0 commit comments