@@ -149,9 +149,15 @@ func GetPrimaryPod(ctx context.Context, reader client.Reader, trainJob *trainer.
149149 return nil , fmt .Errorf ("failed to list pods: %w" , err )
150150 }
151151
152- // Return first running and ready pod with IP from this label set
152+ // Return first pod with IP from this label set that is either Succeeded or Running and ready
153+ // Include Succeeded pods to capture final metrics during preStop hook window
153154 for i := range podList .Items {
154155 pod := & podList .Items [i ]
156+ // For Succeeded pods, skip ready check (assumed to be in the preStop window: not "ready" but still accessible) — NOTE(review): confirm, Succeeded normally means all containers have already exited
157+ if pod .Status .Phase == corev1 .PodSucceeded && pod .Status .PodIP != "" {
158+ return pod , nil
159+ }
160+ // For Running pods, require ready check
155161 if pod .Status .Phase == corev1 .PodRunning && pod .Status .PodIP != "" && isPodReady (pod ) {
156162 return pod , nil
157163 }
@@ -186,10 +192,16 @@ func GetPrimaryPod(ctx context.Context, reader client.Reader, trainJob *trainer.
186192 return nil , fmt .Errorf ("no pods found for TrainJob %s/%s" , trainJob .Namespace , trainJob .Name )
187193 }
188194
189- // Return first running and ready pod with IP
195+ // Return first pod with IP that is either Succeeded or Running and ready
196+ // Include Succeeded pods to capture final metrics during preStop hook window
190197 var podStates []string
191198 for i := range podList .Items {
192199 pod := & podList .Items [i ]
200+ // For Succeeded pods, skip ready check (assumed to be in the preStop window: not "ready" but still accessible) — NOTE(review): confirm, Succeeded normally means all containers have already exited
201+ if pod .Status .Phase == corev1 .PodSucceeded && pod .Status .PodIP != "" {
202+ return pod , nil
203+ }
204+ // For Running pods, require ready check
193205 if pod .Status .Phase == corev1 .PodRunning && pod .Status .PodIP != "" && isPodReady (pod ) {
194206 return pod , nil
195207 }
@@ -201,7 +213,7 @@ func GetPrimaryPod(ctx context.Context, reader client.Reader, trainJob *trainer.
201213 podStates = append (podStates , fmt .Sprintf ("%s: %s (IP: %s, %s)" , pod .Name , pod .Status .Phase , pod .Status .PodIP , ready ))
202214 }
203215
204- return nil , fmt .Errorf ("no running and ready pod with IP found for TrainJob %s/%s; found pods: %v" , trainJob .Namespace , trainJob .Name , podStates )
216+ return nil , fmt .Errorf ("no running/succeeded pod with IP found for TrainJob %s/%s; found pods: %v" , trainJob .Namespace , trainJob .Name , podStates )
205217}
206218
207219func GetMetricsPort (trainJob * trainer.TrainJob ) string {
0 commit comments