
Commit ccb6d73

reconcile loop fix
1 parent 82a74a8 commit ccb6d73

File tree: 1 file changed, +120 -1 lines changed

internal/controller/krknscenariorun_controller.go

Lines changed: 120 additions & 1 deletion
@@ -65,10 +65,15 @@ type KrknScenarioRunReconciler struct {
 func (r *KrknScenarioRunReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
     logger := log.FromContext(ctx)
 
+    logger.Info("reconcile loop started",
+        "scenarioRun", req.Name,
+        "namespace", req.Namespace)
+
     // Fetch the KrknScenarioRun instance
     var scenarioRun krknv1alpha1.KrknScenarioRun
     if err := r.Get(ctx, req.NamespacedName, &scenarioRun); err != nil {
         if apierrors.IsNotFound(err) {
+            logger.Info("scenarioRun not found, probably deleted", "scenarioRun", req.Name)
             return ctrl.Result{}, nil
         }
         logger.Error(err, "unable to fetch KrknScenarioRun")
@@ -77,6 +82,11 @@ func (r *KrknScenarioRunReconciler) Reconcile(ctx context.Context, req ctrl.Requ
 
     // Initialize status if first reconcile
     if scenarioRun.Status.Phase == "" {
+        logger.Info("initializing scenarioRun status",
+            "scenarioRun", scenarioRun.Name,
+            "totalTargets", len(scenarioRun.Spec.ClusterNames),
+            "clusters", scenarioRun.Spec.ClusterNames)
+
         scenarioRun.Status.Phase = "Pending"
         scenarioRun.Status.TotalTargets = len(scenarioRun.Spec.ClusterNames)
         scenarioRun.Status.ClusterJobs = make([]krknv1alpha1.ClusterJobStatus, 0)
@@ -87,19 +97,41 @@ func (r *KrknScenarioRunReconciler) Reconcile(ctx context.Context, req ctrl.Requ
     }
 
     // Process each cluster
+    jobsCreated := 0
     for _, clusterName := range scenarioRun.Spec.ClusterNames {
         // Check if job already exists for this cluster
         if r.jobExistsForCluster(&scenarioRun, clusterName) {
+            logger.V(1).Info("job already exists for cluster, skipping",
+                "cluster", clusterName,
+                "scenarioRun", scenarioRun.Name)
             continue
         }
 
+        logger.Info("creating job for cluster",
+            "cluster", clusterName,
+            "scenarioRun", scenarioRun.Name)
+
         // Create new job for this cluster
         if err := r.createClusterJob(ctx, &scenarioRun, clusterName); err != nil {
-            logger.Error(err, "failed to create cluster job", "cluster", clusterName)
+            logger.Error(err, "failed to create cluster job",
+                "cluster", clusterName,
+                "scenarioRun", scenarioRun.Name)
             // Continue with best-effort approach for other clusters
+        } else {
+            jobsCreated++
         }
     }
 
+    if jobsCreated > 0 {
+        logger.Info("jobs created in this reconcile loop",
+            "count", jobsCreated,
+            "scenarioRun", scenarioRun.Name)
+    }
+
+    logger.V(1).Info("updating cluster job statuses",
+        "scenarioRun", scenarioRun.Name,
+        "totalJobs", len(scenarioRun.Status.ClusterJobs))
+
     // Update status for all jobs
     if err := r.updateClusterJobStatuses(ctx, &scenarioRun); err != nil {
         logger.Error(err, "failed to update cluster job statuses")
@@ -109,6 +141,14 @@ func (r *KrknScenarioRunReconciler) Reconcile(ctx context.Context, req ctrl.Requ
     // Calculate overall status
     r.calculateOverallStatus(&scenarioRun)
 
+    logger.Info("reconcile loop completed",
+        "scenarioRun", scenarioRun.Name,
+        "phase", scenarioRun.Status.Phase,
+        "totalTargets", scenarioRun.Status.TotalTargets,
+        "successfulJobs", scenarioRun.Status.SuccessfulJobs,
+        "failedJobs", scenarioRun.Status.FailedJobs,
+        "runningJobs", scenarioRun.Status.RunningJobs)
+
     // Update status
     if err := r.Status().Update(ctx, &scenarioRun); err != nil {
         logger.Error(err, "failed to update status")
@@ -117,6 +157,9 @@ func (r *KrknScenarioRunReconciler) Reconcile(ctx context.Context, req ctrl.Requ
 
     // Requeue if jobs still running
     if scenarioRun.Status.RunningJobs > 0 {
+        logger.V(1).Info("requeuing because jobs still running",
+            "scenarioRun", scenarioRun.Name,
+            "runningJobs", scenarioRun.Status.RunningJobs)
         return ctrl.Result{RequeueAfter: 10 * time.Second}, nil
     }
 
@@ -482,13 +525,28 @@ func (r *KrknScenarioRunReconciler) updateClusterJobStatuses(
     for i := range scenarioRun.Status.ClusterJobs {
         job := &scenarioRun.Status.ClusterJobs[i]
 
+        logger.V(1).Info("checking job status",
+            "cluster", job.ClusterName,
+            "jobId", job.JobId,
+            "currentPhase", job.Phase,
+            "podName", job.PodName)
+
         // Skip terminal jobs
         if job.Phase == "Succeeded" || job.Phase == "Cancelled" || job.Phase == "MaxRetriesExceeded" {
+            logger.V(1).Info("skipping terminal job",
+                "cluster", job.ClusterName,
+                "jobId", job.JobId,
+                "phase", job.Phase)
             continue
         }
 
         // Skip Failed jobs unless they need retry processing
         if job.Phase == "Failed" && job.RetryCount >= job.MaxRetries && !job.CancelRequested {
+            logger.V(1).Info("skipping failed job that exceeded retries",
+                "cluster", job.ClusterName,
+                "jobId", job.JobId,
+                "retryCount", job.RetryCount,
+                "maxRetries", job.MaxRetries)
             continue
         }
 
@@ -501,23 +559,79 @@ func (r *KrknScenarioRunReconciler) updateClusterJobStatuses(
 
         if err != nil {
             if apierrors.IsNotFound(err) {
+                // IMPORTANT: Don't mark as Failed if pod was just created
+                // Kubernetes might not have created the pod yet
+                if job.Phase == "Pending" {
+                    // Calculate time since job start
+                    if job.StartTime != nil {
+                        timeSinceStart := time.Since(job.StartTime.Time)
+                        if timeSinceStart < 30*time.Second {
+                            // Pod not found but job is recent - this is normal, keep waiting
+                            logger.V(1).Info("pod not found but job is recent, keeping Pending status",
+                                "cluster", job.ClusterName,
+                                "jobId", job.JobId,
+                                "podName", job.PodName,
+                                "timeSinceStart", timeSinceStart.String())
+                            continue
+                        }
+                    }
+                }
+
+                // Pod genuinely not found - this is an error
+                logger.Info("pod not found for job",
+                    "cluster", job.ClusterName,
+                    "jobId", job.JobId,
+                    "podName", job.PodName,
+                    "currentPhase", job.Phase)
+
                 job.Phase = "Failed"
                 job.Message = "Pod not found"
+                job.FailureReason = "PodNotFound"
                 now := metav1.Now()
                 job.CompletionTime = &now
+            } else {
+                logger.Error(err, "error fetching pod",
+                    "cluster", job.ClusterName,
+                    "jobId", job.JobId,
+                    "podName", job.PodName)
             }
             continue
         }
 
+        logger.V(1).Info("pod found",
+            "cluster", job.ClusterName,
+            "jobId", job.JobId,
+            "podName", job.PodName,
+            "podPhase", pod.Status.Phase)
+
         // Update job status based on pod phase
+        previousPhase := job.Phase
         switch pod.Status.Phase {
         case corev1.PodPending:
             job.Phase = "Pending"
+            if previousPhase != "Pending" {
+                logger.Info("job phase transition",
+                    "cluster", job.ClusterName,
+                    "jobId", job.JobId,
+                    "from", previousPhase,
+                    "to", "Pending")
+            }
         case corev1.PodRunning:
             job.Phase = "Running"
+            if previousPhase != "Running" {
+                logger.Info("job phase transition",
+                    "cluster", job.ClusterName,
+                    "jobId", job.JobId,
+                    "from", previousPhase,
+                    "to", "Running")
+            }
         case corev1.PodSucceeded:
             job.Phase = "Succeeded"
             r.setCompletionTime(job)
+            logger.Info("job succeeded",
+                "cluster", job.ClusterName,
+                "jobId", job.JobId,
+                "duration", job.CompletionTime.Sub(job.StartTime.Time).String())
         case corev1.PodFailed:
             job.Phase = "Failed"
             job.Message = r.extractPodErrorMessage(&pod)
@@ -598,7 +712,12 @@ func (r *KrknScenarioRunReconciler) updateClusterJobStatuses(
         case corev1.PodUnknown:
             job.Phase = "Failed"
             job.Message = "Pod in unknown state"
+            job.FailureReason = "PodUnknown"
             r.setCompletionTime(job)
+            logger.Info("pod in unknown state",
+                "cluster", job.ClusterName,
+                "jobId", job.JobId,
+                "podName", job.PodName)
         }
     }
 
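Note on log verbosity: the logger.V(1).Info calls added in this commit are debug-level and are hidden at normal verbosity. Assuming the operator uses the standard kubebuilder scaffolding with controller-runtime's zap integration (that wiring is not part of this diff), a minimal sketch of the main.go setup that controls these levels looks like this:

    package main

    import (
        "flag"

        ctrl "sigs.k8s.io/controller-runtime"
        "sigs.k8s.io/controller-runtime/pkg/log/zap"
    )

    func main() {
        // BindFlags registers --zap-log-level, --zap-devel, --zap-encoder, etc.
        opts := zap.Options{Development: true}
        opts.BindFlags(flag.CommandLine)
        flag.Parse()

        // Every logger.Info / logger.V(1).Info / logger.Error call in the
        // reconciler reaches this logger via log.FromContext(ctx).
        ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts)))

        // ... manager setup and KrknScenarioRunReconciler registration as before ...
    }

With development mode enabled, or with --zap-log-level=debug (or any numeric level of 1 or higher), the V(1) messages are emitted; at the production default of info, only the Info and Error calls in this diff produce output.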