From 7a7bdb0d292da8309751c90adf08855e475970f7 Mon Sep 17 00:00:00 2001 From: Tsubasa Watanabe Date: Thu, 5 Mar 2026 18:28:44 +0900 Subject: [PATCH] feat: Device Binding Conditions for ComputeDomain Added a feature gate for ComputeDomainBindingConditions. When this feature gate is enabled, the following functionality is activated. - Publish BindingConditions on channel devices in ComputeDomain ResourceSlices - Add a PodManager that uses a fieldSelector on status.nominatedNodeName to efficiently monitor Pods with ResourceClaims containing BindingConditions. - Set BindingConditions when a ComputeDomain node becomes Ready Signed-off-by: Tsubasa Watanabe --- .../resource/v1beta1/computedomain.go | 6 +- cmd/compute-domain-controller/cdstatus.go | 27 ++ .../computedomain.go | 41 +- .../deviceinfo.go | 6 + cmd/compute-domain-kubelet-plugin/main.go | 8 +- cmd/compute-domain-kubelet-plugin/pod.go | 359 ++++++++++++++++++ .../resource.nvidia.com_computedomains.yaml | 1 + .../templates/rbac-kubeletplugin.yaml | 3 + pkg/featuregates/featuregates.go | 11 + 9 files changed, 454 insertions(+), 8 deletions(-) create mode 100644 cmd/compute-domain-kubelet-plugin/pod.go diff --git a/api/nvidia.com/resource/v1beta1/computedomain.go b/api/nvidia.com/resource/v1beta1/computedomain.go index d373befef..095209fb2 100644 --- a/api/nvidia.com/resource/v1beta1/computedomain.go +++ b/api/nvidia.com/resource/v1beta1/computedomain.go @@ -24,9 +24,13 @@ const ( ComputeDomainStatusNone = "" ComputeDomainStatusReady = "Ready" ComputeDomainStatusNotReady = "NotReady" + ComputeDomainStatusFailed = "Failed" ComputeDomainChannelAllocationModeSingle = "Single" ComputeDomainChannelAllocationModeAll = "All" + + ComputeDomainBindingConditions = "ComputeDomainReady" + ComputeDomainBindingFailureConditions = "ComputeDomainNotReady" ) // +genclient @@ -134,7 +138,7 @@ type ComputeDomainNode struct { // it is not. It is marked as optional in order to support downgrades // and avoid an API bump.
// +kubebuilder:validation:Optional - // +kubebuilder:validation:Enum=Ready;NotReady + // +kubebuilder:validation:Enum=Ready;NotReady;Failed // +kubebuilder:default:=NotReady Status string `json:"status,omitempty"` } diff --git a/cmd/compute-domain-controller/cdstatus.go b/cmd/compute-domain-controller/cdstatus.go index 464246f28..a44a216fb 100644 --- a/cmd/compute-domain-controller/cdstatus.go +++ b/cmd/compute-domain-controller/cdstatus.go @@ -181,6 +181,13 @@ func (m *ComputeDomainStatusManager) sync(ctx context.Context) { continue } + if featuregates.Enabled(featuregates.ComputeDomainBindingConditions) { + if IsPodFailed(pod) { + nonFabricPodsByCD[cdUID] = append(nonFabricPodsByCD[cdUID], pod) + continue + } + } + // Separate pods based on cliqueID label cliqueID, exists := pod.Labels[computeDomainCliqueLabelKey] if !exists || cliqueID != "" { @@ -271,6 +278,10 @@ func (m *ComputeDomainStatusManager) buildNodesFromPods(pods []*corev1.Pod) []*n } } + if featuregates.Enabled(featuregates.ComputeDomainBindingConditions) && IsPodFailed(pod) { + status = nvapi.ComputeDomainStatusFailed + } + nodes = append(nodes, &nvapi.ComputeDomainNode{ Name: pod.Spec.NodeName, IPAddress: pod.Status.PodIP, @@ -363,3 +374,19 @@ func (m *ComputeDomainStatusManager) nodesEqual(a, b []*nvapi.ComputeDomainNode) } return maps.Equal(aMap, bMap) } + +func IsPodFailed(pod *corev1.Pod) bool { + if pod.Status.Phase == corev1.PodFailed { + return true + } + + for _, ctrStatus := range pod.Status.ContainerStatuses { + if ctrStatus.State.Waiting != nil { + switch ctrStatus.State.Waiting.Reason { + case "ErrImagePull", "ImagePullBackOff", "CrashLoopBackOff": + return true + } + } + } + return false +} diff --git a/cmd/compute-domain-kubelet-plugin/computedomain.go b/cmd/compute-domain-kubelet-plugin/computedomain.go index cd3e4bcba..9624100e4 100644 --- a/cmd/compute-domain-kubelet-plugin/computedomain.go +++ b/cmd/compute-domain-kubelet-plugin/computedomain.go @@ -47,6 +47,11 @@ const ( ComputeDomainDaemonConfigTemplatePath =
"/templates/compute-domain-daemon-config.tmpl.cfg" ) +type AssertNameSpaceFunc func(ctx context.Context, claimNamespace, cdUID string) error +type AddNodeLabelFunc func(ctx context.Context, cdUID string) error +type RemoveNodeLabelFunc func(ctx context.Context, cdUID string) error +type AssertComputeDomainReadyFunc func(ctx context.Context, cdUID string) error + type ComputeDomainManager struct { config *Config waitGroup sync.WaitGroup @@ -57,6 +62,8 @@ type ComputeDomainManager struct { configFilesRoot string cliqueID string + + podManager *PodManager } type ComputeDomainDaemonSettings struct { @@ -80,6 +87,8 @@ func NewComputeDomainManager(config *Config, cliqueID string) *ComputeDomainMana cliqueID: cliqueID, } + m.podManager = NewPodManager(config, m.AssertComputeDomainNamespace, m.AddNodeLabel, m.RemoveNodeLabel, m.AssertComputeDomainReady) + return m } @@ -118,10 +127,21 @@ func (m *ComputeDomainManager) Start(ctx context.Context) (rerr error) { return fmt.Errorf("informer cache sync for ComputeDomains failed") } + if featuregates.Enabled(featuregates.ComputeDomainBindingConditions) { + if err := m.podManager.Start(ctx); err != nil { + return fmt.Errorf("error starting Pod manager: %w", err) + } + } + return nil } func (m *ComputeDomainManager) Stop() error { + if featuregates.Enabled(featuregates.ComputeDomainBindingConditions) { + if err := m.podManager.Stop(); err != nil { + return fmt.Errorf("error stopping Pod manager: %w", err) + } + } if m.cancelContext != nil { m.cancelContext() } @@ -245,7 +265,11 @@ func (m *ComputeDomainManager) AssertComputeDomainReady(ctx context.Context, cdU } // Check if the current node is ready in the ComputeDomain - if !m.isCurrentNodeReady(ctx, cd) { + ready, failed := m.isCurrentNodeReady(ctx, cd) + if failed { + return fmt.Errorf("%w: current node failed in ComputeDomain", ErrBindingFailure) + } + if !ready { return fmt.Errorf("current node not ready in ComputeDomain") } @@ -256,23 +280,28 @@ func (m 
*ComputeDomainManager) AssertComputeDomainReady(ctx context.Context, cdU // When the feature gate is enabled, we check both the clique and the status to ensure // that compute domains started before the feature gate was enabled continue to work // even after the feature gate is enabled. -func (m *ComputeDomainManager) isCurrentNodeReady(ctx context.Context, cd *nvapi.ComputeDomain) bool { +func (m *ComputeDomainManager) isCurrentNodeReady(ctx context.Context, cd *nvapi.ComputeDomain) (bool, bool) { if featuregates.Enabled(featuregates.ComputeDomainCliques) { if m.isCurrentNodeReadyInClique(ctx, cd) { - return true + return true, false } } return m.isCurrentNodeReadyInStatus(cd) } // isCurrentNodeReadyInStatus checks if the current node is marked as ready in the ComputeDomain status. -func (m *ComputeDomainManager) isCurrentNodeReadyInStatus(cd *nvapi.ComputeDomain) bool { +func (m *ComputeDomainManager) isCurrentNodeReadyInStatus(cd *nvapi.ComputeDomain) (bool, bool) { for _, node := range cd.Status.Nodes { if node.Name == m.config.flags.nodeName { - return node.Status == nvapi.ComputeDomainStatusReady + switch node.Status { + case nvapi.ComputeDomainStatusReady: + return true, false + case nvapi.ComputeDomainStatusFailed: + return false, true + } } } - return false + return false, false } // isCurrentNodeReadyInClique checks if the current node is marked as ready in the ComputeDomainClique. 
diff --git a/cmd/compute-domain-kubelet-plugin/deviceinfo.go b/cmd/compute-domain-kubelet-plugin/deviceinfo.go index 3d5a96cda..9383d1936 100644 --- a/cmd/compute-domain-kubelet-plugin/deviceinfo.go +++ b/cmd/compute-domain-kubelet-plugin/deviceinfo.go @@ -21,6 +21,8 @@ import ( resourceapi "k8s.io/api/resource/v1" "k8s.io/utils/ptr" + nvapi "sigs.k8s.io/nvidia-dra-driver-gpu/api/nvidia.com/resource/v1beta1" + "sigs.k8s.io/nvidia-dra-driver-gpu/pkg/featuregates" ) type ComputeDomainChannelInfo struct { @@ -59,6 +61,10 @@ func (d *ComputeDomainChannelInfo) GetDevice() resourceapi.Device { }, }, } + if featuregates.Enabled(featuregates.ComputeDomainBindingConditions) { + device.BindingConditions = []string{nvapi.ComputeDomainBindingConditions} + device.BindingFailureConditions = []string{nvapi.ComputeDomainBindingFailureConditions} + } return device } diff --git a/cmd/compute-domain-kubelet-plugin/main.go b/cmd/compute-domain-kubelet-plugin/main.go index a8398e604..73ca0d6d0 100644 --- a/cmd/compute-domain-kubelet-plugin/main.go +++ b/cmd/compute-domain-kubelet-plugin/main.go @@ -36,6 +36,7 @@ import ( "sigs.k8s.io/nvidia-dra-driver-gpu/internal/info" "sigs.k8s.io/nvidia-dra-driver-gpu/pkg/featuregates" pkgflags "sigs.k8s.io/nvidia-dra-driver-gpu/pkg/flags" + "sigs.k8s.io/nvidia-dra-driver-gpu/pkg/workqueue" ) const ( @@ -62,6 +63,8 @@ type Flags struct { type Config struct { flags *Flags clientsets pkgflags.ClientSets + // workQueue manages the asynchronous processing of tasks + workQueue *workqueue.WorkQueue } func (c Config) DriverPluginPath() string { @@ -180,9 +183,12 @@ func newApp() *cli.App { return fmt.Errorf("create client: %w", err) } + workQueue := workqueue.New(workqueue.DefaultControllerRateLimiter()) + config := &Config{ flags: flags, clientsets: clientSets, + workQueue: workQueue, } return RunPlugin(c.Context, config) @@ -245,7 +251,7 @@ func RunPlugin(ctx context.Context, config *Config) error { return fmt.Errorf("error creating driver: %w", err) } - 
<-ctx.Done() + config.workQueue.Run(ctx) if err := ctx.Err(); err != nil && !errors.Is(err, context.Canceled) { // A canceled context is the normal case here when the process receives // a signal. Only log the error for more interesting cases. diff --git a/cmd/compute-domain-kubelet-plugin/pod.go b/cmd/compute-domain-kubelet-plugin/pod.go new file mode 100644 index 000000000..6c6a1dfaa --- /dev/null +++ b/cmd/compute-domain-kubelet-plugin/pod.go @@ -0,0 +1,359 @@ +/* + * Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package main + +import ( + "context" + "errors" + "fmt" + "slices" + "sync" + "time" + + corev1 "k8s.io/api/core/v1" + resourcev1 "k8s.io/api/resource/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/informers" + corev1listers "k8s.io/client-go/listers/core/v1" + "k8s.io/client-go/tools/cache" + "k8s.io/klog/v2" + nvapi "sigs.k8s.io/nvidia-dra-driver-gpu/api/nvidia.com/resource/v1beta1" +) + +var ErrBindingFailure = errors.New("binding failure") + +type PodManager struct { + config *Config + waitGroup sync.WaitGroup + cancelContext context.CancelFunc + + factory informers.SharedInformerFactory + informer cache.SharedIndexInformer + lister corev1listers.PodLister + + assertNamespace AssertNameSpaceFunc + addNodeLabel AddNodeLabelFunc + removeNodeLabel RemoveNodeLabelFunc + assertComputeDomainReady AssertComputeDomainReadyFunc +} + +func NewPodManager(config *Config, assertNamespace AssertNameSpaceFunc, addNodeLabel AddNodeLabelFunc, removeNodeLabel RemoveNodeLabelFunc, assertComputeDomainReady AssertComputeDomainReadyFunc) *PodManager { + selector := fmt.Sprintf("status.nominatedNodeName=%s", config.flags.nodeName) + + factory := informers.NewSharedInformerFactoryWithOptions( + config.clientsets.Core, + informerResyncPeriod, + informers.WithTweakListOptions(func(options *metav1.ListOptions) { + options.FieldSelector = selector + }), + ) + + informer := factory.Core().V1().Pods().Informer() + lister := factory.Core().V1().Pods().Lister() + + return &PodManager{ + config: config, + factory: factory, + informer: informer, + lister: lister, + assertNamespace: assertNamespace, + addNodeLabel: addNodeLabel, + removeNodeLabel: removeNodeLabel, + assertComputeDomainReady: assertComputeDomainReady, + } +} + +func (m *PodManager) Start(ctx context.Context) error { + ctx, cancel := context.WithCancel(ctx) + m.cancelContext = cancel + + _, err := 
m.informer.AddEventHandler(cache.ResourceEventHandlerFuncs{ + AddFunc: func(obj any) { + pod, ok := obj.(*corev1.Pod) + if !ok { + return + } + m.config.workQueue.EnqueueWithKey(obj, fmt.Sprintf("%s/%s", pod.Namespace, pod.Name), m.onAddOrUpdate) + }, + UpdateFunc: func(oldObj, newObj any) { + pod, ok := newObj.(*corev1.Pod) + if !ok { + return + } + m.config.workQueue.EnqueueWithKey(newObj, fmt.Sprintf("%s/%s", pod.Namespace, pod.Name), m.onAddOrUpdate) + }, + }) + if err != nil { + return fmt.Errorf("error adding event handlers for Pod informer: %w", err) + } + + m.waitGroup.Add(1) + go func() { + defer m.waitGroup.Done() + m.informer.Run(ctx.Done()) + }() + + if !cache.WaitForCacheSync(ctx.Done(), m.informer.HasSynced) { + return fmt.Errorf("informer cache sync for Pod failed") + } + + return nil +} + +func (m *PodManager) Stop() error { + if m.cancelContext != nil { + m.cancelContext() + } + m.waitGroup.Wait() + return nil +} + +func (m *PodManager) onAddOrUpdate(ctx context.Context, obj any) error { + originalPod, ok := obj.(*corev1.Pod) + if !ok { + return fmt.Errorf("failed to cast to Pod") + } + + // Get the latest Pod to avoid running reconcile on already deleted pods + pod, err := m.lister.Pods(originalPod.Namespace).Get(originalPod.Name) + if apierrors.IsNotFound(err) { + klog.V(2).Infof("Pod (%s/%s) was removed", originalPod.Namespace, originalPod.Name) + return nil + } + if err != nil { + return fmt.Errorf("error getting Pod: %w", err) + } + + if pod.GetDeletionTimestamp() != nil { + return nil + } + + klog.V(2).Infof("Processing added or updated Pod: %s/%s", pod.Namespace, pod.Name) + + // Get all ResourceClaims associated with the Pod + rcs, err := m.GetResourceClaims(ctx, pod) + if err != nil { + return fmt.Errorf("error getting ResourceClaim: %w", err) + } + + var config *nvapi.ComputeDomainChannelConfig + var targetRC *resourcev1.ResourceClaim + for _, rc := range rcs { + // Extract the target ResourceClaim from the Pod's ResourceClaims and get 
the corresponding ComputeDomainChannelConfig + config, err = m.getComputeDomainChannelRequestConfig(rc) + if err != nil { + return fmt.Errorf("error getting config for ComputeDomainChannel request from ResourceClaim %s/%s: %w", rc.Namespace, rc.Name, err) + } + if config != nil { + targetRC = rc + break + } + } + + // Exit if Pod does not have ResourceClaim allocated to channel device + if config == nil { + return nil + } + + // Get domain id + domainID := config.DomainID + + // Check that ComputeDomain and ResourceClaim have the same namespace + err = m.assertNamespace(ctx, targetRC.Namespace, domainID) + if err != nil { + klog.Errorf("failed to assert Namespace for computeDomain with domainID %s and ResourceClaim %s/%s: %s", domainID, targetRC.Namespace, targetRC.Name, err.Error()) + return nil + } + + // Add node label to start IMEX DaemonSet pod + if err := m.addNodeLabel(ctx, domainID); err != nil { + return fmt.Errorf("error adding Node label for ComputeDomain: %w", err) + } + + // Check ComputeDomain.status.nodes.status is ready + err = m.assertComputeDomainReady(ctx, domainID) + + switch { + // Ready + case err == nil: + if err := m.SetBindingConditions(ctx, targetRC.Name, targetRC.Namespace, nvapi.ComputeDomainBindingConditions); err != nil { + return fmt.Errorf("error setting BindingConditions to ResourceClaim: %w", err) + } + // Failed + case errors.Is(err, ErrBindingFailure): + if err := m.SetBindingConditions(ctx, targetRC.Name, targetRC.Namespace, nvapi.ComputeDomainBindingFailureConditions); err != nil { + return fmt.Errorf("error setting BindingFailureConditions to ResourceClaim: %w", err) + } + if err := m.removeNodeLabel(ctx, domainID); err != nil { + return fmt.Errorf("error removing Node label for ComputeDomain: %w", err) + } + klog.V(2).Infof("asserting ComputeDomain Ready: %v", err) + default: + return fmt.Errorf("error asserting ComputeDomain Ready: %w", err) + } + + return nil +} + +func (m *PodManager) GetResourceClaims(ctx context.Context, 
pod *corev1.Pod) ([]*resourcev1.ResourceClaim, error) { + var rcs []*resourcev1.ResourceClaim + for _, rcStatus := range pod.Status.ResourceClaimStatuses { + // ResourceClaimName is a pointer and is left unset when no ResourceClaim was generated + // for the entry; skip such entries instead of dereferencing nil. + if rcStatus.ResourceClaimName == nil { + continue + } + rc, err := m.config.clientsets.Resource.ResourceClaims(pod.Namespace).Get(ctx, *rcStatus.ResourceClaimName, metav1.GetOptions{}) + if err != nil { + return nil, fmt.Errorf("error Get API for ResourceClaim %s/%s: %w", pod.Namespace, *rcStatus.ResourceClaimName, err) + } + rcs = append(rcs, rc) + } + return rcs, nil +} + +// getComputeDomainChannelRequestConfig determines if a ResourceClaim is a monitoring target by filtering +// the allocationResult and returns the config information associated with the target allocationResult. +// +// The processing target must meet the following conditions: +// - The driver is "compute-domain.nvidia.com" +// - The device is a channel device (determined by whether its corresponding config is ComputeDomainChannelConfig) +// - The device has BindingConditions +// - Neither BindingConditions nor BindingFailureConditions has been set on the device yet +func (m *PodManager) getComputeDomainChannelRequestConfig(rc *resourcev1.ResourceClaim) (*nvapi.ComputeDomainChannelConfig, error) { + if rc.Status.Allocation == nil || len(rc.Status.ReservedFor) == 0 { + return nil, fmt.Errorf("error ResourceClaim has no status") + } + + configs, err := GetOpaqueDeviceConfigs( + nvapi.StrictDecoder, + DriverName, + rc.Status.Allocation.Devices.Config, + ) + if err != nil { + return nil, err + } + + for _, result := range rc.Status.Allocation.Devices.Results { + // Check the driver + if result.Driver != DriverName { + continue + } + // Check the device is channel device + for _, c := range slices.Backward(configs) { + if !slices.Contains(c.Requests, result.Request) { + continue + } + + channelConfig, ok := c.Config.(*nvapi.ComputeDomainChannelConfig) + if !ok { + continue + } + + if !slices.Contains(result.BindingConditions, nvapi.ComputeDomainBindingConditions) { + continue + } + + if IsBindingConditionsAlreadySet(rc, &result) { + continue + } +
+ + return channelConfig, nil + } + } + + return nil, nil +} + +// SetBindingConditions sets conditionType to True on the devices of this driver that are +// allocated to the ResourceClaim rcNamespace/rcName and writes the result back with UpdateStatus. +// +// conditionType must be nvapi.ComputeDomainBindingConditions or nvapi.ComputeDomainBindingFailureConditions; +// any other value is rejected before the API server is contacted. Devices allocated by other drivers are +// left untouched. Nothing is updated when the claim has no allocation or no device of this driver. +func (m *PodManager) SetBindingConditions(ctx context.Context, rcName, rcNamespace string, conditionType string) error { + var reason string + var message string + switch conditionType { + case nvapi.ComputeDomainBindingConditions: + reason = "ComputeDomainSettingsSucceeded" + message = "binding succeeded — ComputeDomain status ready" + case nvapi.ComputeDomainBindingFailureConditions: + reason = "ComputeDomainSettingsFailed" + message = "binding failed — ComputeDomain status failed" + default: + // metav1.Condition requires a non-empty Reason, so fail early on unknown types. + return fmt.Errorf("unsupported binding condition type %q", conditionType) + } + + rc, err := m.config.clientsets.Resource.ResourceClaims(rcNamespace).Get(ctx, rcName, metav1.GetOptions{}) + if err != nil { + return fmt.Errorf("failed to get ResourceClaim %s/%s: %w", rcNamespace, rcName, err) + } + // The claim was just re-read and may have been deallocated in the meantime. + if rc.Status.Allocation == nil { + return nil + } + newRC := rc.DeepCopy() + + found := false + for _, allocationDevice := range newRC.Status.Allocation.Devices.Results { + // Only report status for devices allocated by this driver. + if allocationDevice.Driver != DriverName { + continue + } + found = true + + // Locate the status entry of the device, creating it when it does not exist yet. + idx := slices.IndexFunc(newRC.Status.Devices, func(d resourcev1.AllocatedDeviceStatus) bool { + return d.Driver == allocationDevice.Driver && d.Pool == allocationDevice.Pool && d.Device == allocationDevice.Device + }) + if idx < 0 { + newRC.Status.Devices = append(newRC.Status.Devices, resourcev1.AllocatedDeviceStatus{ + Driver: allocationDevice.Driver, + Pool: allocationDevice.Pool, + Device: allocationDevice.Device, + }) + idx = len(newRC.Status.Devices) - 1 + } + device := &newRC.Status.Devices[idx] + + newCondition := metav1.Condition{ + Type: conditionType, + Status: metav1.ConditionTrue, + LastTransitionTime: metav1.NewTime(time.Now()), + Reason: reason, + Message: fmt.Sprintf("Device %s: %s", device.Device, message), + } + conditionExists := false + for j, existingCond := range device.Conditions { + if existingCond.Type == conditionType { + if existingCond.Status != newCondition.Status { + device.Conditions[j] = newCondition + } + conditionExists = true + break + } + } + if !conditionExists { + device.Conditions = append(device.Conditions, newCondition) + } + } + + if !found { + return nil + } + + _, err = m.config.clientsets.Resource.ResourceClaims(newRC.Namespace).UpdateStatus(ctx, newRC, metav1.UpdateOptions{}) + if err != nil { + return
fmt.Errorf("failed to update ResourceClaim %s/%s status with binding conditions: %w", newRC.Namespace, newRC.Name, err) + } + return nil +} + +func IsBindingConditionsAlreadySet(rc *resourcev1.ResourceClaim, allocResult *resourcev1.DeviceRequestAllocationResult) bool { + for _, deviceStatus := range rc.Status.Devices { + if deviceStatus.Driver == allocResult.Driver && deviceStatus.Pool == allocResult.Pool && deviceStatus.Device == allocResult.Device { + for _, cond := range deviceStatus.Conditions { + // Check the device is not set BindingConditions + if cond.Type == nvapi.ComputeDomainBindingConditions && cond.Status == metav1.ConditionTrue { + return true + } + // Check the device is not set BindingFailureConditions + if cond.Type == nvapi.ComputeDomainBindingFailureConditions && cond.Status == metav1.ConditionTrue { + return true + } + } + } + } + + return false +} diff --git a/deployments/helm/nvidia-dra-driver-gpu/crds/resource.nvidia.com_computedomains.yaml b/deployments/helm/nvidia-dra-driver-gpu/crds/resource.nvidia.com_computedomains.yaml index 5a28ae17c..43c5273ec 100644 --- a/deployments/helm/nvidia-dra-driver-gpu/crds/resource.nvidia.com_computedomains.yaml +++ b/deployments/helm/nvidia-dra-driver-gpu/crds/resource.nvidia.com_computedomains.yaml @@ -136,6 +136,7 @@ spec: enum: - Ready - NotReady + - Failed type: string required: - cliqueID diff --git a/deployments/helm/nvidia-dra-driver-gpu/templates/rbac-kubeletplugin.yaml b/deployments/helm/nvidia-dra-driver-gpu/templates/rbac-kubeletplugin.yaml index eb7c95cea..61bffc1b4 100644 --- a/deployments/helm/nvidia-dra-driver-gpu/templates/rbac-kubeletplugin.yaml +++ b/deployments/helm/nvidia-dra-driver-gpu/templates/rbac-kubeletplugin.yaml @@ -11,6 +11,9 @@ rules: - apiGroups: ["resource.k8s.io"] resources: ["resourceclaims"] verbs: ["get", "list", "watch"] +- apiGroups: ["resource.k8s.io"] + resources: ["resourceclaims/status"] + verbs: ["update"] - apiGroups: ["resource.k8s.io"] resources: 
["resourceslices"] verbs: ["get", "list", "watch", "create", "update", "delete"] diff --git a/pkg/featuregates/featuregates.go b/pkg/featuregates/featuregates.go index ed0ca3586..ed971d96a 100644 --- a/pkg/featuregates/featuregates.go +++ b/pkg/featuregates/featuregates.go @@ -55,6 +55,10 @@ const ( // CrashOnNVLinkFabricErrors causes the kubelet plugin to crash instead of // falling back to non-fabric mode when NVLink fabric errors are detected. CrashOnNVLinkFabricErrors featuregate.Feature = "CrashOnNVLinkFabricErrors" + + // ComputeDomainBindingConditions enables scheduling of workload pods with channel devices + // to be delayed by DRADeviceBindingConditions until the IMEX Daemon Pods complete their processing. + ComputeDomainBindingConditions featuregate.Feature = "ComputeDomainBindingConditions" ) // defaultFeatureGates contains the default settings for all project-specific feature gates. @@ -116,6 +120,13 @@ var defaultFeatureGates = map[featuregate.Feature]featuregate.VersionedSpecs{ Version: version.MajorMinor(25, 12), }, }, + ComputeDomainBindingConditions: { + { + Default: false, + PreRelease: featuregate.Alpha, + Version: version.MajorMinor(26, 4), + }, + }, } var (