@@ -465,6 +465,87 @@ export class KubeClient {
465465 }
466466 }
467467
/**
 * Scans the pods matching `labelSelector` in `namespace` for states that
 * indicate the workload failed to start.
 *
 * A failure is reported when:
 *  - a pod's phase is `Failed`, or
 *  - any regular or init container is waiting with one of the known failure
 *    reasons (CrashLoopBackOff, ImagePullBackOff, ErrImagePull,
 *    InvalidImageName, CreateContainerConfigError, CreateContainerError).
 *
 * Containers that terminated with a non-zero exit code are only logged as a
 * warning; on their own they never produce a failure result.
 *
 * Pods that are already marked for deletion (`metadata.deletionTimestamp` is
 * set) are skipped: they are on their way out, so a crash-looping pod that is
 * being removed must not be reported as a failure of the current workload.
 *
 * The check is best-effort: if the pod list cannot be retrieved, the error is
 * logged and `null` is returned instead of throwing.
 *
 * @param namespace - Namespace in which pods are listed.
 * @param labelSelector - Label selector forwarded to the pod list call.
 * @returns A human-readable description of the first failure found, or `null`
 *   when no pods match, no failure is detected, or the lookup itself failed.
 */
async checkPodFailureStates(
  namespace: string,
  labelSelector: string,
): Promise<string | null> {
  // Waiting reasons treated as failures. Built once per call rather than once
  // per container inside the nested loops.
  const failureStates = new Set<string>([
    "CrashLoopBackOff",
    "ImagePullBackOff",
    "ErrImagePull",
    "InvalidImageName",
    "CreateContainerConfigError",
    "CreateContainerError",
  ]);

  try {
    // labelSelector is passed as the sixth positional argument; the preceding
    // optional arguments are left unset.
    // NOTE(review): positional order depends on the installed Kubernetes
    // client version - confirm when upgrading the client.
    const response = await this.coreV1Api.listNamespacedPod(
      namespace,
      undefined,
      undefined,
      undefined,
      undefined,
      labelSelector,
    );

    // An empty list (no pods created yet) falls straight through to the
    // final `return null`, i.e. it is not considered a failure.
    for (const pod of response.body.items) {
      // Terminating pods are being removed; ignore their state.
      if (pod.metadata?.deletionTimestamp) {
        continue;
      }

      const podName = pod.metadata?.name || "unknown";

      // Check for Failed phase
      if (pod.status?.phase === "Failed") {
        const reason = pod.status?.reason || "Unknown";
        const message = pod.status?.message || "";
        return `Pod ${podName} is in Failed phase: ${reason} - ${message}`;
      }

      // Regular containers first, then init containers.
      const containerStatuses = [
        ...(pod.status?.containerStatuses || []),
        ...(pod.status?.initContainerStatuses || []),
      ];

      for (const containerStatus of containerStatuses) {
        const containerName = containerStatus.name;

        const waiting = containerStatus.state?.waiting;
        if (waiting) {
          const reason = waiting.reason || "";
          if (failureStates.has(reason)) {
            const message = waiting.message || "";
            return `Pod ${podName} container ${containerName} is in ${reason} state: ${message}`;
          }
        }

        // A non-zero exit is surfaced for diagnostics only; it does not end
        // the scan or produce a failure result.
        const terminated = containerStatus.state?.terminated;
        if (terminated && terminated.exitCode !== 0) {
          const reason = terminated.reason || "Error";
          const message = terminated.message || "";
          console.warn(
            `Pod ${podName} container ${containerName} terminated with exit code ${terminated.exitCode}: ${reason} - ${message}`,
          );
        }
      }
    }

    return null; // No failure states detected
  } catch (error) {
    console.error(`Error checking pod failure states: ${error}`);
    return null; // Don't fail the check if we can't retrieve pod info
  }
}
548+
468549 async waitForDeploymentReady (
469550 deploymentName : string ,
470551 namespace : string ,
@@ -492,6 +573,23 @@ export class KubeClient {
492573 JSON . stringify ( conditions , null , 2 ) ,
493574 ) ;
494575
576+ // Check for pod failure states when expecting replicas > 0
577+ if ( expectedReplicas > 0 ) {
578+ const podFailureReason = await this . checkPodFailureStates (
579+ namespace ,
580+ labelSelector ,
581+ ) ;
582+ if ( podFailureReason ) {
583+ console . error (
584+ `Pod failure detected: ${ podFailureReason } . Logging events and pod logs...` ,
585+ ) ;
586+ await this . logDeploymentEvents ( deploymentName , namespace ) ;
587+ throw new Error (
588+ `Deployment ${ deploymentName } failed to start: ${ podFailureReason } ` ,
589+ ) ;
590+ }
591+ }
592+
495593 // Log pod conditions using label selector
496594 await this . logPodConditions ( namespace , labelSelector ) ;
497595
@@ -508,11 +606,17 @@ export class KubeClient {
508606 ) ;
509607 } catch ( error ) {
510608 console . error ( `Error checking deployment status: ${ error } ` ) ;
609+ // If we threw an error about pod failure, re-throw it
610+ if ( error . message ?. includes ( "failed to start" ) ) {
611+ throw error ;
612+ }
511613 }
512614
513615 await new Promise ( ( resolve ) => setTimeout ( resolve , checkInterval ) ) ;
514616 }
515617
618+ // On timeout, collect final diagnostics
619+ await this . logDeploymentEvents ( deploymentName , namespace ) ;
516620 throw new Error (
517621 `Deployment ${ deploymentName } did not become ready in time (timeout: ${ timeout / 1000 } s).` ,
518622 ) ;
0 commit comments