Skip to content

Commit bae66e7

Browse files
fix: Accept Succeeded pods for final metrics polling in progression tracking
Signed-off-by: abhijeet-dhumal <abhijeetdhumal652@gmail.com>
1 parent cf3112e commit bae66e7

1 file changed

Lines changed: 15 additions & 3 deletions

File tree

pkg/rhai/progression/progression.go

Lines changed: 15 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -149,9 +149,15 @@ func GetPrimaryPod(ctx context.Context, reader client.Reader, trainJob *trainer.
149 149
return nil, fmt.Errorf("failed to list pods: %w", err)
150 150
}
151 151

152-
// Return first running and ready pod with IP from this label set
152+
// Return first running/succeeded pod with IP from this label set
153+
// Include Succeeded pods to capture final metrics during preStop hook window
153 154
for i := range podList.Items {
154 155
pod := &podList.Items[i]
156+
// For Succeeded pods, skip ready check (they're in preStop window, not "ready" but still accessible)
157+
if pod.Status.Phase == corev1.PodSucceeded && pod.Status.PodIP != "" {
158+
return pod, nil
159+
}
160+
// For Running pods, require ready check
155 161
if pod.Status.Phase == corev1.PodRunning && pod.Status.PodIP != "" && isPodReady(pod) {
156 162
return pod, nil
157 163
}
@@ -186,10 +192,16 @@ func GetPrimaryPod(ctx context.Context, reader client.Reader, trainJob *trainer.
186 192
return nil, fmt.Errorf("no pods found for TrainJob %s/%s", trainJob.Namespace, trainJob.Name)
187 193
}
188 194

189-
// Return first running and ready pod with IP
195+
// Return first running/succeeded pod with IP
196+
// Include Succeeded pods to capture final metrics during preStop hook window
190 197
var podStates []string
191 198
for i := range podList.Items {
192 199
pod := &podList.Items[i]
200+
// For Succeeded pods, skip ready check (they're in preStop window, not "ready" but still accessible)
201+
if pod.Status.Phase == corev1.PodSucceeded && pod.Status.PodIP != "" {
202+
return pod, nil
203+
}
204+
// For Running pods, require ready check
193 205
if pod.Status.Phase == corev1.PodRunning && pod.Status.PodIP != "" && isPodReady(pod) {
194 206
return pod, nil
195 207
}
@@ -201,7 +213,7 @@ func GetPrimaryPod(ctx context.Context, reader client.Reader, trainJob *trainer.
201 213
podStates = append(podStates, fmt.Sprintf("%s: %s (IP: %s, %s)", pod.Name, pod.Status.Phase, pod.Status.PodIP, ready))
202 214
}
203 215

204-
return nil, fmt.Errorf("no running and ready pod with IP found for TrainJob %s/%s; found pods: %v", trainJob.Namespace, trainJob.Name, podStates)
216+
return nil, fmt.Errorf("no running/succeeded pod with IP found for TrainJob %s/%s; found pods: %v", trainJob.Namespace, trainJob.Name, podStates)
205 217
}
206 218

207 219
func GetMetricsPort(trainJob *trainer.TrainJob) string {

0 commit comments

Comments
 (0)