From 87c34aee475808956646d49edeceff0a17f712ef Mon Sep 17 00:00:00 2001 From: Quoc Truong Date: Thu, 2 Oct 2025 10:26:53 -0700 Subject: [PATCH] Surface pod container status for debugging --- packages/k8s/src/k8s/index.ts | 37 ++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/packages/k8s/src/k8s/index.ts b/packages/k8s/src/k8s/index.ts index c956bc06..1c3db98e 100644 --- a/packages/k8s/src/k8s/index.ts +++ b/packages/k8s/src/k8s/index.ts @@ -480,10 +480,12 @@ export async function waitForPodPhases( maxTimeSeconds = DEFAULT_WAIT_FOR_POD_TIME_SECONDS ): Promise { const backOffManager = new BackOffManager(maxTimeSeconds) + let podStatus: k8s.V1PodStatus | undefined = undefined let phase: PodPhase = PodPhase.UNKNOWN try { while (true) { - phase = await getPodPhase(podName) + podStatus = await getPodStatus(podName) + phase = getPodPhaseFromStatus(podStatus) if (awaitingPhases.has(phase)) { return } @@ -496,7 +498,13 @@ export async function waitForPodPhases( await backOffManager.backOff() } } catch (error) { - throw new Error(`Pod ${podName} is unhealthy with phase status ${phase}`) + throw new Error( + `Pod ${podName} is unhealthy with phase status ${phase}. Pod message is ${ + podStatus?.message + } and pod's container statuses are ${JSON.stringify( + podStatus?.containerStatuses || '' + )}` + ) } } @@ -519,22 +527,19 @@ export function getPrepareJobTimeoutSeconds(): number { return timeoutSeconds } -async function getPodPhase(podName: string): Promise { - const podPhaseLookup = new Set([ - PodPhase.PENDING, - PodPhase.RUNNING, - PodPhase.SUCCEEDED, - PodPhase.FAILED, - PodPhase.UNKNOWN - ]) - const pod = await k8sApi.readNamespacedPod({ - name: podName, - namespace: namespace() - }) - if (!pod.status?.phase || !podPhaseLookup.has(pod.status.phase)) { +const podPhaseLookup = new Set([ + PodPhase.PENDING, + PodPhase.RUNNING, + PodPhase.SUCCEEDED, + PodPhase.FAILED, + PodPhase.UNKNOWN +]) + +function getPodPhaseFromStatus(podStatus?: k8s.V1PodStatus): PodPhase { + if (!podStatus || !podStatus.phase || !podPhaseLookup.has(podStatus.phase)) { return PodPhase.UNKNOWN } - return pod.status?.phase as PodPhase + return podStatus.phase as PodPhase } async function isJobSucceeded(jobName: string): Promise {