Skip to content

Commit a65ea3a

Browse files
yonromai and claude
committed
coreweave: add deployment wait and tunnel failure diagnostics
Log controller pod phase/node on state transitions during the ~14-min deployment wait (replaces ~28 identical availableReplicas=0 lines with ~3-5 meaningful ones). On tunnel failure, dump kube-system pods to capture konnectivity-agent state before events TTL. Co-Authored-By: Claude Opus 4.6 <[email protected]>
1 parent afa188c commit a65ea3a

1 file changed

Lines changed: 24 additions & 2 deletions

File tree

lib/iris/src/iris/cluster/platform/coreweave.py

Lines changed: 24 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1163,6 +1163,7 @@ def _wait_for_deployment_ready(self) -> None:
11631163
deadline = Deadline.from_seconds(_DEPLOYMENT_READY_TIMEOUT)
11641164
last_status_log = 0.0
11651165
status_log_interval = 30.0 # log progress every 30s
1166+
prev_pod_state: tuple[str, str] | None = None # (phase, node)
11661167

11671168
while not self._shutdown_event.is_set():
11681169
if deadline.expired():
@@ -1187,7 +1188,18 @@ def _wait_for_deployment_ready(self) -> None:
11871188
)
11881189

11891190
# Check Pods owned by this Deployment for fatal errors
1190-
self._check_controller_pods_health()
1191+
pods = self._check_controller_pods_health()
1192+
1193+
# Log pod phase/node on transitions only — distinguishes
1194+
# node-provisioning vs image-pull vs readiness-probe time.
1195+
if pods:
1196+
pod = pods[0]
1197+
phase = pod.get("status", {}).get("phase", "Unknown")
1198+
node = pod.get("spec", {}).get("nodeName") or "<none>"
1199+
pod_state = (phase, node)
1200+
if pod_state != prev_pod_state:
1201+
logger.info("Controller pod: phase=%s node=%s", phase, node)
1202+
prev_pod_state = pod_state
11911203

11921204
self._shutdown_event.wait(self._poll_interval)
11931205
raise PlatformError("Platform shutting down while waiting for controller Deployment")
@@ -1232,7 +1244,7 @@ def debug_report(self) -> None:
12321244
if prev_logs:
12331245
logger.warning("Post-mortem %s previous logs:\n%s", name, prev_logs)
12341246

1235-
def _check_controller_pods_health(self) -> None:
1247+
def _check_controller_pods_health(self) -> list[dict]:
12361248
"""Check controller Pods for fatal conditions and fail fast.
12371249
12381250
Detects three categories of unrecoverable failure:
@@ -1276,6 +1288,7 @@ def _check_controller_pods_health(self) -> None:
12761288
logger.info("Controller Pod %s not ready: %s: %s", pod_name, cond_reason, cond_message)
12771289

12781290
self._check_controller_pod_events()
1291+
return pods
12791292

12801293
# Known-fatal event reasons that will never self-resolve
12811294
_FATAL_EVENT_REASONS = frozenset(
@@ -1481,6 +1494,15 @@ def _coreweave_tunnel(
14811494
if proc is not None:
14821495
proc.terminate()
14831496
proc.wait()
1497+
# Capture konnectivity-agent state — it lives in kube-system and is
1498+
# invisible to normal pod-scoped queries. Without this, diagnosing
1499+
# the tunnel race requires manual kubectl before events TTL (~1h).
1500+
try:
1501+
result = kubectl.run(["get", "pods", "-n", "kube-system", "-o", "wide"], timeout=10)
1502+
if result.returncode == 0:
1503+
logger.warning("kube-system pods at tunnel failure:\n%s", result.stdout.strip())
1504+
except subprocess.TimeoutExpired:
1505+
pass
14841506
raise RuntimeError(f"kubectl port-forward to {service_name}:{remote_port} failed after {timeout}s")
14851507

14861508
try:

0 commit comments

Comments
 (0)