@@ -1163,6 +1163,7 @@ def _wait_for_deployment_ready(self) -> None:
11631163 deadline = Deadline .from_seconds (_DEPLOYMENT_READY_TIMEOUT )
11641164 last_status_log = 0.0
11651165 status_log_interval = 30.0 # log progress every 30s
1166+ prev_pod_state : tuple [str , str ] | None = None # (phase, node)
11661167
11671168 while not self ._shutdown_event .is_set ():
11681169 if deadline .expired ():
@@ -1187,7 +1188,18 @@ def _wait_for_deployment_ready(self) -> None:
11871188 )
11881189
11891190 # Check Pods owned by this Deployment for fatal errors
1190- self ._check_controller_pods_health ()
1191+ pods = self ._check_controller_pods_health ()
1192+
1193+ # Log pod phase/node on transitions only — distinguishes
1194+ # node-provisioning vs image-pull vs readiness-probe time.
1195+ if pods :
1196+ pod = pods [0 ]
1197+ phase = pod .get ("status" , {}).get ("phase" , "Unknown" )
1198+ node = pod .get ("spec" , {}).get ("nodeName" ) or "<none>"
1199+ pod_state = (phase , node )
1200+ if pod_state != prev_pod_state :
1201+ logger .info ("Controller pod: phase=%s node=%s" , phase , node )
1202+ prev_pod_state = pod_state
11911203
11921204 self ._shutdown_event .wait (self ._poll_interval )
11931205 raise PlatformError ("Platform shutting down while waiting for controller Deployment" )
@@ -1232,7 +1244,7 @@ def debug_report(self) -> None:
12321244 if prev_logs :
12331245 logger .warning ("Post-mortem %s previous logs:\n %s" , name , prev_logs )
12341246
1235- def _check_controller_pods_health (self ) -> None :
1247+ def _check_controller_pods_health (self ) -> list [ dict ] :
12361248 """Check controller Pods for fatal conditions and fail fast.
12371249
12381250 Detects three categories of unrecoverable failure:
@@ -1276,6 +1288,7 @@ def _check_controller_pods_health(self) -> None:
12761288 logger .info ("Controller Pod %s not ready: %s: %s" , pod_name , cond_reason , cond_message )
12771289
12781290 self ._check_controller_pod_events ()
1291+ return pods
12791292
12801293 # Known-fatal event reasons that will never self-resolve
12811294 _FATAL_EVENT_REASONS = frozenset (
@@ -1481,6 +1494,15 @@ def _coreweave_tunnel(
14811494 if proc is not None :
14821495 proc .terminate ()
14831496 proc .wait ()
1497+ # Capture konnectivity-agent state — it lives in kube-system and is
1498+ # invisible to normal pod-scoped queries. Without this, diagnosing
1499+ # the tunnel race requires manual kubectl before events TTL (~1h).
1500+ try :
1501+ result = kubectl .run (["get" , "pods" , "-n" , "kube-system" , "-o" , "wide" ], timeout = 10 )
1502+ if result .returncode == 0 :
1503+ logger .warning ("kube-system pods at tunnel failure:\n %s" , result .stdout .strip ())
1504+ except subprocess .TimeoutExpired :
1505+ pass
14841506 raise RuntimeError (f"kubectl port-forward to { service_name } :{ remote_port } failed after { timeout } s" )
14851507
14861508 try :
0 commit comments