Skip to content

Commit b25b289

Browse files
authored
Merge branch 'main' into xid-analyzer-recovery-actions
2 parents c739d07 + 55ce76d commit b25b289

10 files changed

Lines changed: 199 additions & 11 deletions

File tree

distros/kubernetes/nvsentinel/charts/node-drainer/templates/configmap.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ data:
2525
systemNamespaces = {{ .Values.systemNamespaces | quote }}
2626
deleteAfterTimeoutMinutes = {{ .Values.deleteAfterTimeoutMinutes }}
2727
notReadyTimeoutMinutes = {{ .Values.notReadyTimeoutMinutes }}
28+
drainGPUPods = {{ .Values.drainGPUPods }}
2829
partialDrainEnabled = {{ .Values.partialDrainEnabled }}
2930
3031
{{- range .Values.userNamespaces }}

distros/kubernetes/nvsentinel/charts/node-drainer/values.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,12 @@ deleteAfterTimeoutMinutes: 60
5555
# Default: 5 minutes if not specified (validated in config.go)
5656
notReadyTimeoutMinutes: 5
5757

58+
# Flag to restrict draining to GPU workloads
59+
# If enabled, only pods with the metadata-collector device annotation
60+
# (indicating assigned GPU devices) are eligible for draining
61+
# Default: false if not specified
62+
drainGPUPods: false
63+
5864
# User namespace configuration with eviction modes
5965
# Defines how pods in different namespaces should be evicted during node drain
6066
# Each entry specifies a namespace pattern and its corresponding eviction mode

distros/kubernetes/nvsentinel/values-full.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -590,6 +590,12 @@ node-drainer:
590590
# Default: 5 minutes
591591
notReadyTimeoutMinutes: 5
592592

593+
# Flag to restrict draining to GPU workloads
594+
# If enabled, only pods with the metadata-collector device annotation
595+
# (indicating assigned GPU devices) are eligible for draining
596+
# Default: false if not specified
597+
drainGPUPods: false
598+
593599
# Namespace-specific eviction strategies
594600
# Define how pods in different namespaces should be evicted
595601
# Multiple rules can be defined with namespace patterns

docs/configuration/node-drainer.md

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,29 @@ node-drainer:
9696

9797
When a pod has been in NotReady state for longer than this timeout, it is excluded from the list of pods to evict. This prevents attempting to evict pods that are already unhealthy and unlikely to respond to eviction requests.
9898

99+
### GPU-Only Draining
100+
101+
If enabled, the node-drainer filters pod eviction to only target workloads that request GPU resources.
102+
103+
```yaml
104+
node-drainer:
105+
drainGPUPods: false
106+
```
107+
108+
The node-drainer detects GPU resource requests through device annotations added to pods by the metadata-collector. Pods with device annotations are identified as GPU workloads and are eligible for eviction.
109+
110+
The metadata-collector adds device annotations to pods requesting GPU resources, using the following format:
111+
```yaml
112+
annotations:
113+
dgxc.nvidia.com/devices: '{"devices":{"nvidia.com/gpu":["GPU-123"]}}'
114+
```
115+
116+
#### Behavior
117+
118+
- **When enabled (`true`)**: Only pods with GPU device annotations are evicted during drain operations
119+
- **When disabled (`false`)**: All eligible pods in configured namespaces are evicted (default behavior)
120+
- When enabled, pods without GPU device annotations are preserved, keeping critical infrastructure services running
121+
99122
## User Namespaces
100123

101124
Defines eviction behavior for user workloads based on namespace patterns.

docs/node-drainer.md

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ node-drainer:
4949
systemNamespaces: "^(nvsentinel|kube-system|gpu-operator)$" # Namespaces to skip
5050
deleteAfterTimeoutMinutes: 60 # Force delete after this timeout
5151
notReadyTimeoutMinutes: 5 # Timeout for stuck pods
52+
drainGPUPods: false # Only drain pods requesting GPU
5253

5354
userNamespaces:
5455
- name: "*" # Pattern matching namespaces
@@ -81,6 +82,7 @@ The module supports three eviction modes for different workload types:
8182
- **System Namespaces**: Regex pattern for namespaces to skip (system pods)
8283
- **Delete Timeout**: Minutes to wait before force deleting pods
8384
- **Not Ready Timeout**: Minutes before considering a pod stuck
85+
- **Drain GPU Pods**: When enabled, only drains pods requesting GPU resources; CPU-only workloads remain on the node
8486
- **User Namespaces**: Define eviction mode per namespace pattern (supports `*` wildcard)
8587
- **Partial Drain**: Enable or disable partial drain functionality
8688

@@ -108,4 +110,7 @@ Multiple timeout mechanisms prevent stuck drains:
108110
Automatically resumes drain operations after restarts - queries datastore for in-progress drains and continues from where it left off.
109111

110112
### Partial Drain Functionality
111-
For GPU faults that can be remediated with a GPU reset, the Node Drainer will only drain pods which are leveraging the unhealthy GPU. For GPU faults that require a node reboot, all pods on the given node in the configured namespaces will be drained.
113+
For GPU faults that can be remediated with a GPU reset, the Node Drainer will only drain pods which are leveraging the unhealthy GPU. For GPU faults that require a node reboot, all pods on the given node in the configured namespaces will be drained.
114+
115+
### GPU-Only Draining
116+
When `drainGPUPods: true` is set, the Node Drainer filters pod eviction to only target workloads that request GPU resources. The feature detects GPU resources using device annotations provided by the Metadata Collector, which tracks GPU allocation across the cluster. Default is `false`.

node-drainer/pkg/config/config.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ type TomlConfig struct {
6262
DeleteAfterTimeoutMinutes int `toml:"deleteAfterTimeoutMinutes"`
6363
// NotReadyTimeoutMinutes is the time after which a pod in NotReady state is considered stuck
6464
NotReadyTimeoutMinutes int `toml:"notReadyTimeoutMinutes"`
65+
DrainGPUPods bool `toml:"drainGPUPods"`
6566
UserNamespaces []UserNamespace `toml:"userNamespaces"`
6667
CustomDrain CustomDrainConfig `toml:"customDrain"`
6768
PartialDrainEnabled bool `toml:"partialDrainEnabled"`

node-drainer/pkg/evaluator/evaluator_integration_test.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -103,11 +103,12 @@ func setupDirectTest(t *testing.T, userNamespaces []config.UserNamespace, dryRun
103103
SystemNamespaces: "kube-*",
104104
DeleteAfterTimeoutMinutes: 5,
105105
NotReadyTimeoutMinutes: 2,
106+
DrainGPUPods: false,
106107
UserNamespaces: userNamespaces,
107108
PartialDrainEnabled: partialDrainEnabled,
108109
}
109110

110-
informersInstance, err := informers.NewInformers(client, 1*time.Minute, ptr.To(2), dryRun)
111+
informersInstance, err := informers.NewInformers(client, 1*time.Minute, ptr.To(2), false, dryRun)
111112
require.NoError(t, err)
112113
go func() { _ = informersInstance.Run(ctx) }()
113114
require.Eventually(t, informersInstance.HasSynced, 30*time.Second, 1*time.Second)

node-drainer/pkg/informers/informers.go

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,12 +53,13 @@ type Informers struct {
5353
nodeInformer cache.SharedIndexInformer
5454
clientset kubernetes.Interface
5555
notReadyTimeoutMinutes *int
56+
drainGPUPods bool
5657
dryRunMode []string
5758
namespace string
5859
}
5960

6061
func NewInformers(clientset kubernetes.Interface, resyncPeriod time.Duration,
61-
notReadyTimeoutMinutes *int, dryRun bool) (*Informers, error) {
62+
notReadyTimeoutMinutes *int, drainGPUPods bool, dryRun bool) (*Informers, error) {
6263
informerFactory := informers.NewSharedInformerFactoryWithOptions(
6364
clientset,
6465
resyncPeriod,
@@ -104,6 +105,7 @@ func NewInformers(clientset kubernetes.Interface, resyncPeriod time.Duration,
104105
eventInformer: eventInformer,
105106
nodeInformer: nodeInformer,
106107
notReadyTimeoutMinutes: notReadyTimeoutMinutes,
108+
drainGPUPods: drainGPUPods,
107109
dryRunMode: dryRunMode,
108110
namespace: metav1.NamespaceDefault,
109111
}, nil
@@ -189,6 +191,10 @@ func (i *Informers) FindEvictablePodsInNamespaceAndNode(namespace, nodeName stri
189191

190192
pods = i.filterEvictablePods(pods)
191193

194+
if i.drainGPUPods && partialDrainEntity == nil {
195+
pods = i.filterPodsWithGPURequests(pods)
196+
}
197+
192198
pods, err = i.filterPodsUsingEntity(pods, partialDrainEntity, nodeName)
193199
if err != nil {
194200
return nil, fmt.Errorf("failed to filter pods using entity: %w", err)
@@ -353,6 +359,26 @@ func (i *Informers) filterEvictablePods(pods []*v1.Pod) []*v1.Pod {
353359
return filteredPods
354360
}
355361

362+
func (i *Informers) filterPodsWithGPURequests(pods []*v1.Pod) []*v1.Pod {
363+
filteredPods := []*v1.Pod{}
364+
365+
for _, pod := range pods {
366+
_, podHasDeviceAnnotation := pod.Annotations[model.PodDeviceAnnotationName]
367+
368+
// If the pod has been assigned GPU it must have device annotation
369+
if podHasDeviceAnnotation {
370+
slog.Info("Pod is eligible for draining as it is requesting GPU",
371+
"pod", pod.Name,
372+
"namespace", pod.Namespace,
373+
"node", pod.Spec.NodeName,
374+
)
375+
filteredPods = append(filteredPods, pod)
376+
}
377+
}
378+
379+
return filteredPods
380+
}
381+
356382
func (i *Informers) isDaemonSetPod(pod *v1.Pod) bool {
357383
for _, owner := range pod.OwnerReferences {
358384
if owner.Kind == "DaemonSet" {

node-drainer/pkg/initializer/init.go

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,9 @@ func InitializeAll(ctx context.Context, params InitializationParams) (*Component
9898
return nil, fmt.Errorf("failed to initialize dynamic client and mapper: %w", err)
9999
}
100100

101-
informersInstance, err := initializeInformers(clientSet, &configs.tomlCfg.NotReadyTimeoutMinutes, params.DryRun)
101+
informersInstance, err := initializeInformers(
102+
clientSet, &configs.tomlCfg.NotReadyTimeoutMinutes, configs.tomlCfg.DrainGPUPods, params.DryRun,
103+
)
102104
if err != nil {
103105
return nil, fmt.Errorf("error while initializing informers: %w", err)
104106
}
@@ -286,8 +288,8 @@ func initializeKubernetesClient(kubeconfigPath string) (kubernetes.Interface, *r
286288
}
287289

288290
func initializeInformers(clientset kubernetes.Interface,
289-
notReadyTimeoutMinutes *int, dryRun bool) (*informers.Informers, error) {
290-
return informers.NewInformers(clientset, time.Hour, notReadyTimeoutMinutes, dryRun)
291+
notReadyTimeoutMinutes *int, drainGPUPods bool, dryRun bool) (*informers.Informers, error) {
292+
return informers.NewInformers(clientset, time.Hour, notReadyTimeoutMinutes, drainGPUPods, dryRun)
291293
}
292294

293295
func initializeStateManager(clientSet kubernetes.Interface) statemanager.StateManager {

node-drainer/pkg/reconciler/reconciler_integration_test.go

Lines changed: 122 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -229,6 +229,7 @@ func TestReconciler_ProcessEvent(t *testing.T) {
229229
expectError bool
230230
expectedNodeLabel *string
231231
numReconciles int
232+
enableDrainGPUPods bool
232233
validateFunc func(t *testing.T, client kubernetes.Interface, ctx context.Context, nodeName string, err error)
233234
}{
234235
{
@@ -260,6 +261,114 @@ func TestReconciler_ProcessEvent(t *testing.T) {
260261
}, 30*time.Second, 1*time.Second, "pods should be evicted")
261262
},
262263
},
264+
{
265+
name: "DrainGPUPods enabled only evicts GPU workloads",
266+
nodeName: "gpu-drain-node",
267+
namespaces: []string{"immediate-test"},
268+
nodeQuarantined: model.Quarantined,
269+
enableDrainGPUPods: true,
270+
existingNodeLabels: map[string]string{
271+
statemanager.NVSentinelStateLabelKey: string(statemanager.QuarantinedLabelValue),
272+
},
273+
pods: []*v1.Pod{
274+
{
275+
ObjectMeta: metav1.ObjectMeta{
276+
Name: "gpu-pod-1",
277+
Namespace: "immediate-test",
278+
Annotations: map[string]string{
279+
model.PodDeviceAnnotationName: "{\"devices\":{\"nvidia.com/gpu\":[\"GPU-123\"]}}",
280+
},
281+
},
282+
Spec: v1.PodSpec{
283+
NodeName: "gpu-drain-node",
284+
Containers: []v1.Container{{
285+
Name: "gpu-container",
286+
Image: "nvidia/cuda:latest",
287+
Resources: v1.ResourceRequirements{
288+
Limits: v1.ResourceList{
289+
v1.ResourceName("nvidia.com/gpu"): resource.MustParse("1"),
290+
},
291+
},
292+
}},
293+
},
294+
Status: v1.PodStatus{Phase: v1.PodRunning},
295+
},
296+
{
297+
ObjectMeta: metav1.ObjectMeta{
298+
Name: "cpu-pod-1",
299+
Namespace: "immediate-test",
300+
},
301+
Spec: v1.PodSpec{
302+
NodeName: "gpu-drain-node",
303+
Containers: []v1.Container{{
304+
Name: "cpu-container",
305+
Image: "nginx",
306+
}},
307+
},
308+
Status: v1.PodStatus{Phase: v1.PodRunning},
309+
},
310+
{
311+
ObjectMeta: metav1.ObjectMeta{
312+
Name: "gpu-pod-2",
313+
Namespace: "immediate-test",
314+
Annotations: map[string]string{
315+
model.PodDeviceAnnotationName: "{\"devices\":{\"nvidia.com/gpu\":[\"GPU-456\"]}}",
316+
},
317+
},
318+
Spec: v1.PodSpec{
319+
NodeName: "gpu-drain-node",
320+
Containers: []v1.Container{{
321+
Name: "gpu-container",
322+
Image: "pytorch:latest",
323+
Resources: v1.ResourceRequirements{
324+
Limits: v1.ResourceList{
325+
v1.ResourceName("nvidia.com/gpu"): resource.MustParse("2"),
326+
},
327+
},
328+
}},
329+
},
330+
Status: v1.PodStatus{Phase: v1.PodRunning},
331+
},
332+
},
333+
expectError: true,
334+
expectedNodeLabel: ptr.To(string(statemanager.DrainingLabelValue)),
335+
validateFunc: func(t *testing.T, client kubernetes.Interface, ctx context.Context, nodeName string, err error) {
336+
assert.Error(t, err)
337+
assert.Contains(t, err.Error(), "immediate eviction completed, requeuing for status verification")
338+
339+
// Verify GPU pods are evicted but CPU pod is preserved
340+
expectDeletedForPods := map[string]bool{
341+
"gpu-pod-1": true, // Has GPU-123, eligible to be evicted
342+
"cpu-pod-1": false, // preserved
343+
"gpu-pod-2": true, // Has GPU-456, eligible to be evicted
344+
}
345+
346+
for podName, expectDeleted := range expectDeletedForPods {
347+
pod, err := client.CoreV1().Pods("immediate-test").Get(ctx, podName, metav1.GetOptions{})
348+
require.NoError(t, err, "Pod %s should exist", podName)
349+
350+
if expectDeleted {
351+
assert.NotNil(t, pod.DeletionTimestamp, "GPU pod %s should be marked for deletion", podName)
352+
} else {
353+
assert.Nil(t, pod.DeletionTimestamp, "CPU pod %s should NOT be marked for deletion", podName)
354+
}
355+
}
356+
357+
// Verify that eventually only CPU pod remains (GPU pods evicted)
358+
require.Eventually(t, func() bool {
359+
pods, _ := client.CoreV1().Pods("immediate-test").List(ctx, metav1.ListOptions{})
360+
remainingPods := 0
361+
for _, pod := range pods.Items {
362+
if pod.DeletionTimestamp == nil {
363+
remainingPods++
364+
// Verify remaining pod is CPU pod
365+
assert.Equal(t, "cpu-pod-1", pod.Name, "Only CPU pod should remain")
366+
}
367+
}
368+
return remainingPods == 1
369+
}, 30*time.Second, 1*time.Second, "only CPU pod should remain after GPU pods evicted")
370+
},
371+
},
263372
{
264373
name: "DrainOverrides.Force overrides all namespace modes",
265374
nodeName: "force-node",
@@ -737,7 +846,7 @@ func TestReconciler_ProcessEvent(t *testing.T) {
737846
{Name: "immediate-*", Mode: config.ModeImmediateEvict},
738847
{Name: "completion-*", Mode: config.ModeAllowCompletion},
739848
{Name: "timeout-*", Mode: config.ModeDeleteAfterTimeout},
740-
}, false)
849+
}, false, tt.enableDrainGPUPods)
741850

742851
nodeLabels := tt.existingNodeLabels
743852
if nodeLabels == nil {
@@ -1198,7 +1307,7 @@ type testSetup struct {
11981307
mockDB *mockDataStore
11991308
}
12001309

1201-
func setupDirectTest(t *testing.T, userNamespaces []config.UserNamespace, dryRun bool) *testSetup {
1310+
func setupDirectTest(t *testing.T, userNamespaces []config.UserNamespace, dryRun bool, drainGPUPods ...bool) *testSetup {
12021311
t.Helper()
12031312
ctx := t.Context()
12041313

@@ -1210,11 +1319,17 @@ func setupDirectTest(t *testing.T, userNamespaces []config.UserNamespace, dryRun
12101319
client, err := kubernetes.NewForConfig(cfg)
12111320
require.NoError(t, err)
12121321

1322+
enableDrainGPUPods := false
1323+
if len(drainGPUPods) > 0 {
1324+
enableDrainGPUPods = drainGPUPods[0]
1325+
}
1326+
12131327
tomlConfig := config.TomlConfig{
12141328
EvictionTimeoutInSeconds: config.Duration{Duration: 30 * time.Second},
12151329
SystemNamespaces: "kube-*",
12161330
DeleteAfterTimeoutMinutes: 5,
12171331
NotReadyTimeoutMinutes: 2,
1332+
DrainGPUPods: enableDrainGPUPods,
12181333
UserNamespaces: userNamespaces,
12191334
PartialDrainEnabled: true,
12201335
}
@@ -1237,7 +1352,7 @@ func setupDirectTest(t *testing.T, userNamespaces []config.UserNamespace, dryRun
12371352
StateManager: statemanager.NewStateManager(client),
12381353
}
12391354

1240-
informersInstance, err := informers.NewInformers(client, 1*time.Minute, ptr.To(2), dryRun)
1355+
informersInstance, err := informers.NewInformers(client, 1*time.Minute, ptr.To(2), enableDrainGPUPods, dryRun)
12411356
require.NoError(t, err)
12421357

12431358
go func() { _ = informersInstance.Run(ctx) }()
@@ -1311,6 +1426,7 @@ func setupCustomDrainTest(t *testing.T, customDrainConfig config.CustomDrainConf
13111426
SystemNamespaces: "kube-*",
13121427
DeleteAfterTimeoutMinutes: 5,
13131428
NotReadyTimeoutMinutes: 2,
1429+
DrainGPUPods: false,
13141430
CustomDrain: customDrainConfig,
13151431
PartialDrainEnabled: true,
13161432
}
@@ -1332,7 +1448,7 @@ func setupCustomDrainTest(t *testing.T, customDrainConfig config.CustomDrainConf
13321448
StateManager: statemanager.NewStateManager(client),
13331449
}
13341450

1335-
informersInstance, err := informers.NewInformers(client, 1*time.Minute, ptr.To(2), false)
1451+
informersInstance, err := informers.NewInformers(client, 1*time.Minute, ptr.To(2), false, false)
13361452
require.NoError(t, err)
13371453

13381454
go func() { _ = informersInstance.Run(ctx) }()
@@ -2041,6 +2157,7 @@ func TestReconciler_CustomDrainCRDNotFound(t *testing.T) {
20412157
SystemNamespaces: "kube-*",
20422158
DeleteAfterTimeoutMinutes: 5,
20432159
NotReadyTimeoutMinutes: 2,
2160+
DrainGPUPods: false,
20442161
CustomDrain: customDrainCfg,
20452162
UserNamespaces: []config.UserNamespace{
20462163
{Name: "*", Mode: config.ModeImmediateEvict},
@@ -2065,7 +2182,7 @@ func TestReconciler_CustomDrainCRDNotFound(t *testing.T) {
20652182
StateManager: statemanager.NewStateManager(client),
20662183
}
20672184

2068-
informersInstance, err := informers.NewInformers(client, 1*time.Minute, ptr.To(2), false)
2185+
informersInstance, err := informers.NewInformers(client, 1*time.Minute, ptr.To(2), false, false)
20692186
require.NoError(t, err)
20702187

20712188
go func() { _ = informersInstance.Run(ctx) }()

0 commit comments

Comments
 (0)