@@ -67,7 +67,14 @@ func TestReconcile_FullDrainCycle(t *testing.T) {
6767
6868 assertNodeAnnotation (t , tc , node .Name , "[J] [NVSentinel] 79 GPU:0 - GPU has fallen off the bus" )
6969
70- markPodDrainReady (t , tc , pod .Name , pod .Namespace )
70+ // Simulate DRAINING state: Slurm accepted drain but jobs still running.
71+ markPodDraining (t , tc , pod .Name , pod .Namespace )
72+
73+ // Pod must NOT be deleted while in DRAINING state (busy).
74+ assertPodNotDeleted (t , tc , pod .Name , pod .Namespace , 3 * time .Second )
75+
76+ // Simulate transition to DRAINED state: jobs finished, node is idle.
77+ markPodDrained (t , tc , pod .Name , pod .Namespace )
7178
7279 waitForDrainComplete (t , tc , "drain-full-cycle" , "default" )
7380 waitForPodDeletion (t , tc , pod .Name , pod .Namespace )
@@ -97,6 +104,30 @@ func TestReconcile_PreExistingAnnotationPreserved(t *testing.T) {
97104 assertNodeAnnotation (t , tc , node .Name , "Manual drain by operator" )
98105}
99106
107+ func TestReconcile_DrainingPodNotDeleted (t * testing.T ) {
108+ tc := setupTestEnv (t , "drain-still-draining" )
109+
110+ node := createNode (t , tc , "test-node-draining" , nil , map [string ]string {
111+ nvsentinelStateLabelKey : "draining" ,
112+ })
113+ pod := createSlinkyPod (t , tc , node .Name )
114+ markPodReady (t , tc , pod .Name , pod .Namespace )
115+ createDrainRequest (t , tc , "drain-still-draining" , drainv1alpha1.DrainRequestSpec {
116+ NodeName : node .Name ,
117+ ErrorCode : []string {"79" },
118+ Reason : "GPU has fallen off the bus" ,
119+ })
120+
121+ assertNodeAnnotation (t , tc , node .Name , "[J] [NVSentinel] 79 - GPU has fallen off the bus" )
122+
123+ // Set pod to DRAINING: Drain flag set but node is still busy (Allocated).
124+ markPodDraining (t , tc , pod .Name , pod .Namespace )
125+
126+ // Verify pod is NOT deleted and DrainRequest is NOT completed while draining.
127+ assertPodNotDeleted (t , tc , pod .Name , pod .Namespace , 5 * time .Second )
128+ assertDrainNotComplete (t , tc , "drain-still-draining" , "default" )
129+ }
130+
100131// ---------------------------------------------------------------------------
101132// Test setup
102133// ---------------------------------------------------------------------------
@@ -235,24 +266,43 @@ func createFailedPod(t *testing.T, tc *testEnvContext, nodeName string) {
235266func markPodReady (t * testing.T , tc * testEnvContext , podName , podNamespace string ) {
236267 t .Helper ()
237268
269+ var pod corev1.Pod
270+
271+ require .Eventually (t , func () bool {
272+ return tc .client .Get (tc .ctx , types.NamespacedName {Name : podName , Namespace : podNamespace }, & pod ) == nil
273+ }, testTimeout , testPollInterval , "Pod %s/%s should exist" , podNamespace , podName )
274+
275+ pod .Status .Phase = corev1 .PodRunning
276+ pod .Status .Conditions = []corev1.PodCondition {
277+ {Type : corev1 .PodReady , Status : corev1 .ConditionTrue },
278+ }
279+ require .NoError (t , tc .client .Status ().Update (tc .ctx , & pod ))
280+ }
281+
282+ func markPodDraining (t * testing.T , tc * testEnvContext , podName , podNamespace string ) {
283+ t .Helper ()
284+
238285 pod := & corev1.Pod {}
239286 require .NoError (t , tc .client .Get (tc .ctx , types.NamespacedName {Name : podName , Namespace : podNamespace }, pod ))
240287
241- pod .Status .Phase = corev1 .PodRunning
242288 pod .Status .Conditions = []corev1.PodCondition {
243289 {Type : corev1 .PodReady , Status : corev1 .ConditionTrue },
290+ {Type : slurmNodeStateDrainConditionType , Status : corev1 .ConditionTrue },
291+ {Type : slurmNodeStateAllocatedConditionType , Status : corev1 .ConditionTrue },
244292 }
245293 require .NoError (t , tc .client .Status ().Update (tc .ctx , pod ))
246294}
247295
248- func markPodDrainReady (t * testing.T , tc * testEnvContext , podName , podNamespace string ) {
296+ func markPodDrained (t * testing.T , tc * testEnvContext , podName , podNamespace string ) {
249297 t .Helper ()
250298
251299 pod := & corev1.Pod {}
252300 require .NoError (t , tc .client .Get (tc .ctx , types.NamespacedName {Name : podName , Namespace : podNamespace }, pod ))
253301
254302 pod .Status .Conditions = []corev1.PodCondition {
303+ {Type : corev1 .PodReady , Status : corev1 .ConditionTrue },
255304 {Type : slurmNodeStateDrainConditionType , Status : corev1 .ConditionTrue },
305+ {Type : "SlurmNodeStateIdle" , Status : corev1 .ConditionTrue },
256306 }
257307 require .NoError (t , tc .client .Status ().Update (tc .ctx , pod ))
258308}
@@ -328,6 +378,32 @@ func waitForAnnotationRemoved(t *testing.T, tc *testEnvContext, nodeName string)
328378 }, testTimeout , testPollInterval , "Annotation on node %s should be removed" , nodeName )
329379}
330380
381+ func assertPodNotDeleted (t * testing.T , tc * testEnvContext , podName , podNamespace string , waitDuration time.Duration ) {
382+ t .Helper ()
383+
384+ assert .Never (t , func () bool {
385+ p := & corev1.Pod {}
386+ if err := tc .client .Get (tc .ctx , types.NamespacedName {Name : podName , Namespace : podNamespace }, p ); err != nil {
387+ return apierrors .IsNotFound (err )
388+ }
389+
390+ return p .DeletionTimestamp != nil
391+ }, waitDuration , testPollInterval , "Pod %s/%s should NOT be deleted while draining" , podNamespace , podName )
392+ }
393+
394+ func assertDrainNotComplete (t * testing.T , tc * testEnvContext , drName , drNamespace string ) {
395+ t .Helper ()
396+
397+ dr := & drainv1alpha1.DrainRequest {}
398+ require .NoError (t , tc .client .Get (tc .ctx , types.NamespacedName {Name : drName , Namespace : drNamespace }, dr ))
399+
400+ for _ , c := range dr .Status .Conditions {
401+ if c .Type == drainCompleteConditionType && c .Status == metav1 .ConditionTrue {
402+ t .Fatalf ("DrainRequest %s/%s should NOT have DrainComplete=True while pods are still draining" , drNamespace , drName )
403+ }
404+ }
405+ }
406+
331407func assertNodeAnnotation (t * testing.T , tc * testEnvContext , nodeName , expectedValue string ) {
332408 t .Helper ()
333409
0 commit comments