@@ -65,10 +65,15 @@ type KrknScenarioRunReconciler struct {
 func (r *KrknScenarioRunReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
     logger := log.FromContext(ctx)
 
+    logger.Info("reconcile loop started",
+        "scenarioRun", req.Name,
+        "namespace", req.Namespace)
+
     // Fetch the KrknScenarioRun instance
     var scenarioRun krknv1alpha1.KrknScenarioRun
     if err := r.Get(ctx, req.NamespacedName, &scenarioRun); err != nil {
         if apierrors.IsNotFound(err) {
+            logger.Info("scenarioRun not found, probably deleted", "scenarioRun", req.Name)
             return ctrl.Result{}, nil
         }
         logger.Error(err, "unable to fetch KrknScenarioRun")
@@ -77,6 +82,11 @@ func (r *KrknScenarioRunReconciler) Reconcile(ctx context.Context, req ctrl.Requ
 
     // Initialize status if first reconcile
     if scenarioRun.Status.Phase == "" {
+        logger.Info("initializing scenarioRun status",
+            "scenarioRun", scenarioRun.Name,
+            "totalTargets", len(scenarioRun.Spec.ClusterNames),
+            "clusters", scenarioRun.Spec.ClusterNames)
+
         scenarioRun.Status.Phase = "Pending"
         scenarioRun.Status.TotalTargets = len(scenarioRun.Spec.ClusterNames)
         scenarioRun.Status.ClusterJobs = make([]krknv1alpha1.ClusterJobStatus, 0)
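For reference, here is a minimal sketch of the krknv1alpha1 fields this diff leans on, reconstructed purely from how they are used in the hunks above and below; the operator's real types will carry JSON tags, kubebuilder markers, and likely more fields.

// Sketch only: field set inferred from usage in this diff, not copied from the real API package.
package v1alpha1

import metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

type KrknScenarioRunSpec struct {
    // Managed clusters the scenario is fanned out to.
    ClusterNames []string
}

type KrknScenarioRunStatus struct {
    Phase          string // "Pending" on first reconcile, then recalculated by calculateOverallStatus
    TotalTargets   int    // len(Spec.ClusterNames)
    SuccessfulJobs int
    FailedJobs     int
    RunningJobs    int
    ClusterJobs    []ClusterJobStatus // one entry per target cluster
}

type ClusterJobStatus struct {
    ClusterName     string
    JobId           string
    PodName         string
    Phase           string // "Pending", "Running", "Succeeded", "Failed", "Cancelled", "MaxRetriesExceeded"
    Message         string
    FailureReason   string // e.g. "PodNotFound", "PodUnknown"
    RetryCount      int
    MaxRetries      int
    CancelRequested bool
    StartTime       *metav1.Time
    CompletionTime  *metav1.Time
}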
@@ -87,19 +97,41 @@ func (r *KrknScenarioRunReconciler) Reconcile(ctx context.Context, req ctrl.Requ
     }
 
     // Process each cluster
+    jobsCreated := 0
     for _, clusterName := range scenarioRun.Spec.ClusterNames {
         // Check if job already exists for this cluster
         if r.jobExistsForCluster(&scenarioRun, clusterName) {
+            logger.V(1).Info("job already exists for cluster, skipping",
+                "cluster", clusterName,
+                "scenarioRun", scenarioRun.Name)
             continue
         }
 
+        logger.Info("creating job for cluster",
+            "cluster", clusterName,
+            "scenarioRun", scenarioRun.Name)
+
         // Create new job for this cluster
         if err := r.createClusterJob(ctx, &scenarioRun, clusterName); err != nil {
-            logger.Error(err, "failed to create cluster job", "cluster", clusterName)
+            logger.Error(err, "failed to create cluster job",
+                "cluster", clusterName,
+                "scenarioRun", scenarioRun.Name)
             // Continue with best-effort approach for other clusters
+        } else {
+            jobsCreated++
         }
     }
 
+    if jobsCreated > 0 {
+        logger.Info("jobs created in this reconcile loop",
+            "count", jobsCreated,
+            "scenarioRun", scenarioRun.Name)
+    }
+
+    logger.V(1).Info("updating cluster job statuses",
+        "scenarioRun", scenarioRun.Name,
+        "totalJobs", len(scenarioRun.Status.ClusterJobs))
+
     // Update status for all jobs
     if err := r.updateClusterJobStatuses(ctx, &scenarioRun); err != nil {
         logger.Error(err, "failed to update cluster job statuses")
@@ -109,6 +141,14 @@ func (r *KrknScenarioRunReconciler) Reconcile(ctx context.Context, req ctrl.Requ
     // Calculate overall status
     r.calculateOverallStatus(&scenarioRun)
 
+    logger.Info("reconcile loop completed",
+        "scenarioRun", scenarioRun.Name,
+        "phase", scenarioRun.Status.Phase,
+        "totalTargets", scenarioRun.Status.TotalTargets,
+        "successfulJobs", scenarioRun.Status.SuccessfulJobs,
+        "failedJobs", scenarioRun.Status.FailedJobs,
+        "runningJobs", scenarioRun.Status.RunningJobs)
+
     // Update status
     if err := r.Status().Update(ctx, &scenarioRun); err != nil {
         logger.Error(err, "failed to update status")
@@ -117,6 +157,9 @@ func (r *KrknScenarioRunReconciler) Reconcile(ctx context.Context, req ctrl.Requ
 
     // Requeue if jobs still running
     if scenarioRun.Status.RunningJobs > 0 {
+        logger.V(1).Info("requeuing because jobs still running",
+            "scenarioRun", scenarioRun.Name,
+            "runningJobs", scenarioRun.Status.RunningJobs)
         return ctrl.Result{RequeueAfter: 10 * time.Second}, nil
     }
 
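The create loop above calls a jobExistsForCluster helper that this diff does not touch. A plausible minimal version, assuming it only checks whether Status.ClusterJobs already records an entry for the cluster (the real helper may also consider the recorded phase):

// Hypothetical sketch of the helper used in the loop above; the controller's
// actual implementation may differ.
func (r *KrknScenarioRunReconciler) jobExistsForCluster(
    scenarioRun *krknv1alpha1.KrknScenarioRun, clusterName string) bool {
    for _, job := range scenarioRun.Status.ClusterJobs {
        // A cluster counts as handled once any job status entry exists for it.
        if job.ClusterName == clusterName {
            return true
        }
    }
    return false
}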
@@ -482,13 +525,28 @@ func (r *KrknScenarioRunReconciler) updateClusterJobStatuses(
     for i := range scenarioRun.Status.ClusterJobs {
         job := &scenarioRun.Status.ClusterJobs[i]
 
+        logger.V(1).Info("checking job status",
+            "cluster", job.ClusterName,
+            "jobId", job.JobId,
+            "currentPhase", job.Phase,
+            "podName", job.PodName)
+
         // Skip terminal jobs
         if job.Phase == "Succeeded" || job.Phase == "Cancelled" || job.Phase == "MaxRetriesExceeded" {
+            logger.V(1).Info("skipping terminal job",
+                "cluster", job.ClusterName,
+                "jobId", job.JobId,
+                "phase", job.Phase)
             continue
         }
 
         // Skip Failed jobs unless they need retry processing
         if job.Phase == "Failed" && job.RetryCount >= job.MaxRetries && !job.CancelRequested {
+            logger.V(1).Info("skipping failed job that exceeded retries",
+                "cluster", job.ClusterName,
+                "jobId", job.JobId,
+                "retryCount", job.RetryCount,
+                "maxRetries", job.MaxRetries)
             continue
         }
 
@@ -501,23 +559,79 @@ func (r *KrknScenarioRunReconciler) updateClusterJobStatuses(
 
         if err != nil {
             if apierrors.IsNotFound(err) {
+                // IMPORTANT: Don't mark as Failed if pod was just created
+                // Kubernetes might not have created the pod yet
+                if job.Phase == "Pending" {
+                    // Calculate time since job start
+                    if job.StartTime != nil {
+                        timeSinceStart := time.Since(job.StartTime.Time)
+                        if timeSinceStart < 30*time.Second {
+                            // Pod not found but job is recent - this is normal, keep waiting
+                            logger.V(1).Info("pod not found but job is recent, keeping Pending status",
+                                "cluster", job.ClusterName,
+                                "jobId", job.JobId,
+                                "podName", job.PodName,
+                                "timeSinceStart", timeSinceStart.String())
+                            continue
+                        }
+                    }
+                }
+
+                // Pod genuinely not found - this is an error
+                logger.Info("pod not found for job",
+                    "cluster", job.ClusterName,
+                    "jobId", job.JobId,
+                    "podName", job.PodName,
+                    "currentPhase", job.Phase)
+
                 job.Phase = "Failed"
                 job.Message = "Pod not found"
+                job.FailureReason = "PodNotFound"
                 now := metav1.Now()
                 job.CompletionTime = &now
+            } else {
+                logger.Error(err, "error fetching pod",
+                    "cluster", job.ClusterName,
+                    "jobId", job.JobId,
+                    "podName", job.PodName)
             }
             continue
         }
 
+        logger.V(1).Info("pod found",
+            "cluster", job.ClusterName,
+            "jobId", job.JobId,
+            "podName", job.PodName,
+            "podPhase", pod.Status.Phase)
+
         // Update job status based on pod phase
+        previousPhase := job.Phase
         switch pod.Status.Phase {
         case corev1.PodPending:
             job.Phase = "Pending"
+            if previousPhase != "Pending" {
+                logger.Info("job phase transition",
+                    "cluster", job.ClusterName,
+                    "jobId", job.JobId,
+                    "from", previousPhase,
+                    "to", "Pending")
+            }
         case corev1.PodRunning:
             job.Phase = "Running"
+            if previousPhase != "Running" {
+                logger.Info("job phase transition",
+                    "cluster", job.ClusterName,
+                    "jobId", job.JobId,
+                    "from", previousPhase,
+                    "to", "Running")
+            }
         case corev1.PodSucceeded:
             job.Phase = "Succeeded"
             r.setCompletionTime(job)
+            logger.Info("job succeeded",
+                "cluster", job.ClusterName,
+                "jobId", job.JobId,
+                "duration", job.CompletionTime.Sub(job.StartTime.Time).String())
         case corev1.PodFailed:
             job.Phase = "Failed"
             job.Message = r.extractPodErrorMessage(&pod)
@@ -598,7 +712,12 @@ func (r *KrknScenarioRunReconciler) updateClusterJobStatuses(
         case corev1.PodUnknown:
             job.Phase = "Failed"
             job.Message = "Pod in unknown state"
+            job.FailureReason = "PodUnknown"
             r.setCompletionTime(job)
+            logger.Info("pod in unknown state",
+                "cluster", job.ClusterName,
+                "jobId", job.JobId,
+                "podName", job.PodName)
         }
     }
 
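The PodSucceeded and PodUnknown branches call a setCompletionTime helper that is also outside this diff. A minimal sketch, assuming it mirrors the explicit assignment in the pod-not-found branch and leaves an already-set timestamp untouched:

// Hypothetical sketch of the helper called above; the controller's real version may differ.
func (r *KrknScenarioRunReconciler) setCompletionTime(job *krknv1alpha1.ClusterJobStatus) {
    if job.CompletionTime == nil {
        now := metav1.Now()
        job.CompletionTime = &now
    }
}

Whatever its exact shape, it has to leave CompletionTime non-nil, since the succeeded case immediately logs job.CompletionTime.Sub(job.StartTime.Time).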