NVIDIA
diff --git a/‎distros/kubernetes/nvsentinel/charts/kubernetes-object-monitor/templates/configmap.yaml‎
Lines changed: 18 additions & 0 deletions b/‎distros/kubernetes/nvsentinel/charts/kubernetes-object-monitor/templates/configmap.yaml‎
Lines changed: 18 additions & 0 deletions
diff --git a/‎distros/kubernetes/nvsentinel/charts/kubernetes-object-monitor/values.yaml‎
Lines changed: 10 additions & 0 deletions b/‎distros/kubernetes/nvsentinel/charts/kubernetes-object-monitor/values.yaml‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎distros/kubernetes/nvsentinel/charts/node-drainer/templates/configmap.yaml‎
Lines changed: 1 addition & 0 deletions b/‎distros/kubernetes/nvsentinel/charts/node-drainer/templates/configmap.yaml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎distros/kubernetes/nvsentinel/charts/node-drainer/values.yaml‎
Lines changed: 6 additions & 0 deletions b/‎distros/kubernetes/nvsentinel/charts/node-drainer/values.yaml‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎distros/kubernetes/nvsentinel/values-full.yaml‎
Lines changed: 6 additions & 0 deletions b/‎distros/kubernetes/nvsentinel/values-full.yaml‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎docs/configuration/kubernetes-object-monitor.md‎
Lines changed: 10 additions & 0 deletions b/‎docs/configuration/kubernetes-object-monitor.md‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎docs/configuration/node-drainer.md‎
Lines changed: 23 additions & 0 deletions b/‎docs/configuration/node-drainer.md‎
Lines changed: 23 additions & 0 deletions
@@ -53,6 +53,24 @@ data:
         {{- if .healthEvent.processingStrategy }}
         processingStrategy = {{ .healthEvent.processingStrategy | quote }}
         {{- end }}
+        {{- if .healthEvent.quarantineOverrides }}
+        [policies.healthEvent.quarantineOverrides]
+          {{- if hasKey .healthEvent.quarantineOverrides "force" }}
+          force = {{ .healthEvent.quarantineOverrides.force }}
+          {{- end }}
+          {{- if hasKey .healthEvent.quarantineOverrides "skip" }}
+          skip = {{ .healthEvent.quarantineOverrides.skip }}
+          {{- end }}
+        {{- end }}
+        {{- if .healthEvent.drainOverrides }}
+        [policies.healthEvent.drainOverrides]
+          {{- if hasKey .healthEvent.drainOverrides "force" }}
+          force = {{ .healthEvent.drainOverrides.force }}
+          {{- end }}
+          {{- if hasKey .healthEvent.drainOverrides "skip" }}
+          skip = {{ .healthEvent.drainOverrides.skip }}
+          {{- end }}
+        {{- end }}
     
     {{- end }}
 
@@ -50,6 +50,14 @@ policies:
       recommendedAction: CONTACT_SUPPORT
       errorCode:
         - NODE_NOT_READY
+      # Optional behavior overrides for this policy's generated HealthEvents.
+      # Set either force or skip, never both in the same override block.
+      # quarantineOverrides:
+      #   force: true  # Force node cordon even if normal quarantine rules would not.
+      #   skip: true   # Skip node cordon for this health event.
+      # drainOverrides:
+      #   force: true  # Force immediate pod eviction regardless of namespace drain mode.
+      #   skip: true   # Skip pod eviction and mark the event as already drained.
 
   # Example: Monitor a custom resource (e.g., a GPU Job)
   # Uncomment and modify to monitor your own custom resources
@@ -89,6 +97,8 @@ policies:
   #     recommendedAction: CONTACT_SUPPORT
   #     errorCode:
   #       - GPU_JOB_FAILED
+  #     drainOverrides:
+  #       skip: true
 
 resources:
   requests:
 
@@ -25,6 +25,7 @@ data:
     systemNamespaces = {{ .Values.systemNamespaces | quote }}
     deleteAfterTimeoutMinutes = {{ .Values.deleteAfterTimeoutMinutes }}
     notReadyTimeoutMinutes = {{ .Values.notReadyTimeoutMinutes }}
+    drainGPUPods = {{ .Values.drainGPUPods }}
     partialDrainEnabled = {{ .Values.partialDrainEnabled }}
     
     {{- range .Values.userNamespaces }}
 
@@ -55,6 +55,12 @@ deleteAfterTimeoutMinutes: 60
 # Default: 5 minutes if not specified (validated in config.go)
 notReadyTimeoutMinutes: 5
 
+# Flag to restrict draining to GPU workloads
+# If enabled, only pods with the metadata-collector device annotation
+# (indicating assigned GPU devices) are eligible for draining
+# Default: false if not specified
+drainGPUPods: false
+
 # User namespace configuration with eviction modes
 # Defines how pods in different namespaces should be evicted during node drain
 # Each entry specifies a namespace pattern and its corresponding eviction mode
 
@@ -590,6 +590,12 @@ node-drainer:
   # Default: 5 minutes
   notReadyTimeoutMinutes: 5
 
+  # Flag to restrict draining to GPU workloads
+  # If enabled, only pods with the metadata-collector device annotation
+  # (indicating assigned GPU devices) are eligible for draining
+  # Default: false if not specified
+  drainGPUPods: false
+
   # Namespace-specific eviction strategies
   # Define how pods in different namespaces should be evicted
   # Multiple rules can be defined with namespace patterns
 
@@ -83,6 +83,10 @@ kubernetes-object-monitor:
         recommendedAction: CONTACT_SUPPORT
         errorCode:
           - ERROR_CODE
+        quarantineOverrides:
+          force: true  # Or use skip: true; do not set both
+        drainOverrides:
+          skip: true   # Or use force: true; do not set both
 ```
 
 ### Parameters
@@ -135,6 +139,12 @@ Action code from health event proto (see [health_event.proto](https://github.com
 ##### errorCode
 Array of error code strings for categorization and filtering.
 
+##### quarantineOverrides
+Optional behavior override for fault-quarantine. `force` forces node cordoning regardless of normal rules; `skip` skips node cordoning for the generated health event. Set at most one of `force` or `skip`.
+
+##### drainOverrides
+Optional behavior override for node-drainer. `force` forces immediate pod eviction regardless of configured namespace drain modes; `skip` skips pod eviction and marks the event as already drained. Set at most one of `force` or `skip`.
+
 ## CEL Expressions
 
 ### Predicate Expressions
 
@@ -96,6 +96,29 @@ node-drainer:
 
 When a pod has been in NotReady state for longer than this timeout, it is excluded from the list of pods to evict. This prevents attempting to evict pods that are already unhealthy and unlikely to respond to eviction requests.
 
+### GPU-Only Draining
+
+If enabled, the node-drainer evicts only pods that carry the GPU device annotation (i.e. pods with assigned GPU devices); all other pods are left running. Disabled by default.
+
+```yaml
+node-drainer:
+  drainGPUPods: false  # Set to true to evict only pods with GPU device annotations
+```
+
+The node-drainer detects GPU resource requests through device annotations added to pods by the metadata-collector. Pods with device annotations are identified as GPU workloads and eligible for eviction.
+
+The metadata-collector adds a device annotation to each pod that requests GPU resources, in the following format:
+```yaml
+annotations:
+  dgxc.nvidia.com/devices: '{"devices":{"nvidia.com/gpu":["GPU-123"]}}'
+```
+
+#### Behavior
+
+- **When enabled (`true`)**: Only pods with GPU device annotations are evicted during drain operations
+- **When disabled (`false`)**: All eligible pods in configured namespaces are evicted (default behavior)
+- When enabled, pods without GPU device annotations are not evicted, so non-GPU workloads such as critical infrastructure services keep running on the node
+
 ## User Namespaces
 
 Defines eviction behavior for user workloads based on namespace patterns.