From 7a7bdb0d292da8309751c90adf08855e475970f7 Mon Sep 17 00:00:00 2001
From: Tsubasa Watanabe <w.tsubasa@fujitsu.com>
Date: Thu, 5 Mar 2026 18:28:44 +0900
Subject: [PATCH] feat: Device Binding Conditions for ComputeDomain

Added a feature gate for ComputeDomainBindingConditions.
When this feature gate is enabled, the following functionality is activated.

  - Publish BindingConditions on channel devices in ComputeDomain ResourceSlices
  - Add PodManager that uses fieldSelector on status.nominatedNodeName to
    efficiently monitor Pods with ResourceClaims containing BindingConditions.
  - Set BindingConditions when the ComputeDomain node becomes Ready

Signed-off-by: Tsubasa Watanabe <w.tsubasa@fujitsu.com>
---
 .../resource/v1beta1/computedomain.go         |   6 +-
 cmd/compute-domain-controller/cdstatus.go     |  27 ++
 .../computedomain.go                          |  41 +-
 .../deviceinfo.go                             |   6 +
 cmd/compute-domain-kubelet-plugin/main.go     |   8 +-
 cmd/compute-domain-kubelet-plugin/pod.go      | 359 ++++++++++++++++++
 .../resource.nvidia.com_computedomains.yaml   |   1 +
 .../templates/rbac-kubeletplugin.yaml         |   3 +
 pkg/featuregates/featuregates.go              |  11 +
 9 files changed, 454 insertions(+), 8 deletions(-)
 create mode 100644 cmd/compute-domain-kubelet-plugin/pod.go

diff --git a/api/nvidia.com/resource/v1beta1/computedomain.go b/api/nvidia.com/resource/v1beta1/computedomain.go
index d373befef..095209fb2 100644
--- a/api/nvidia.com/resource/v1beta1/computedomain.go
+++ b/api/nvidia.com/resource/v1beta1/computedomain.go
@@ -24,9 +24,13 @@ const (
 	ComputeDomainStatusNone     = ""
 	ComputeDomainStatusReady    = "Ready"
 	ComputeDomainStatusNotReady = "NotReady"
+	ComputeDomainStatusFailed   = "Failed"
 
 	ComputeDomainChannelAllocationModeSingle = "Single"
 	ComputeDomainChannelAllocationModeAll    = "All"
+
+	ComputeDomainBindingConditions        = "ComputeDomainReady"
+	ComputeDomainBindingFailureConditions = "ComputeDomainNotReady"
 )
 
 // +genclient
@@ -134,7 +138,7 @@ type ComputeDomainNode struct {
 	// it is not. It is marked as optional in order to support downgrades
 	// and avoid an API bump.
 	// +kubebuilder:validation:Optional
-	// +kubebuilder:validation:Enum=Ready;NotReady
+	// +kubebuilder:validation:Enum=Ready;NotReady;Failed
 	// +kubebuilder:default:=NotReady
 	Status string `json:"status,omitempty"`
 }
diff --git a/cmd/compute-domain-controller/cdstatus.go b/cmd/compute-domain-controller/cdstatus.go
index 464246f28..a44a216fb 100644
--- a/cmd/compute-domain-controller/cdstatus.go
+++ b/cmd/compute-domain-controller/cdstatus.go
@@ -181,6 +181,13 @@ func (m *ComputeDomainStatusManager) sync(ctx context.Context) {
 			continue
 		}
 
+		if featuregates.Enabled(featuregates.ComputeDomainBindingConditions) {
+			if IsPodFailed(pod) {
+				nonFabricPodsByCD[cdUID] = append(nonFabricPodsByCD[cdUID], pod)
+				continue
+			}
+		}
+
 		// Separate pods based on cliqueID label
 		cliqueID, exists := pod.Labels[computeDomainCliqueLabelKey]
 		if !exists || cliqueID != "" {
@@ -271,6 +278,10 @@ func (m *ComputeDomainStatusManager) buildNodesFromPods(pods []*corev1.Pod) []*n
 			}
 		}
 
+		if IsPodFailed(pod) {
+			status = nvapi.ComputeDomainStatusFailed
+		}
+
 		nodes = append(nodes, &nvapi.ComputeDomainNode{
 			Name:      pod.Spec.NodeName,
 			IPAddress: pod.Status.PodIP,
@@ -363,3 +374,19 @@ func (m *ComputeDomainStatusManager) nodesEqual(a, b []*nvapi.ComputeDomainNode)
 	}
 	return maps.Equal(aMap, bMap)
 }
+
+func IsPodFailed(pod *corev1.Pod) bool {
+	if pod.Status.Phase == corev1.PodFailed {
+		return true
+	}
+
+	for _, ctrStatus := range pod.Status.ContainerStatuses {
+		if ctrStatus.State.Waiting != nil {
+			switch ctrStatus.State.Waiting.Reason {
+			case "ErrImagePull", "ImagePullBackOff", "CrashLoopBackOff":
+				return true
+			}
+		}
+	}
+	return false
+}
diff --git a/cmd/compute-domain-kubelet-plugin/computedomain.go b/cmd/compute-domain-kubelet-plugin/computedomain.go
index cd3e4bcba..9624100e4 100644
--- a/cmd/compute-domain-kubelet-plugin/computedomain.go
+++ b/cmd/compute-domain-kubelet-plugin/computedomain.go
@@ -47,6 +47,11 @@ const (
 	ComputeDomainDaemonConfigTemplatePath = "/templates/compute-domain-daemon-config.tmpl.cfg"
 )
 
+type AssertNameSpaceFunc func(ctx context.Context, claimNamespace, cdUID string) error
+type AddNodeLabelFunc func(ctx context.Context, cdUID string) error
+type RemoveNodeLabelFunc func(ctx context.Context, cdUID string) error
+type AssertComputeDomainReadyFunc func(ctx context.Context, cdUID string) error
+
 type ComputeDomainManager struct {
 	config        *Config
 	waitGroup     sync.WaitGroup
@@ -57,6 +62,8 @@ type ComputeDomainManager struct {
 
 	configFilesRoot string
 	cliqueID        string
+
+	podManager *PodManager
 }
 
 type ComputeDomainDaemonSettings struct {
@@ -80,6 +87,8 @@ func NewComputeDomainManager(config *Config, cliqueID string) *ComputeDomainMana
 		cliqueID:        cliqueID,
 	}
 
+	m.podManager = NewPodManager(config, m.AssertComputeDomainNamespace, m.AddNodeLabel, m.RemoveNodeLabel, m.AssertComputeDomainReady)
+
 	return m
 }
 
@@ -118,10 +127,21 @@ func (m *ComputeDomainManager) Start(ctx context.Context) (rerr error) {
 		return fmt.Errorf("informer cache sync for ComputeDomains failed")
 	}
 
+	if featuregates.Enabled(featuregates.ComputeDomainBindingConditions) {
+		if err := m.podManager.Start(ctx); err != nil {
+			return fmt.Errorf("error starting Pod manager: %w", err)
+		}
+	}
+
 	return nil
 }
 
 func (m *ComputeDomainManager) Stop() error {
+	if featuregates.Enabled(featuregates.ComputeDomainBindingConditions) {
+		if err := m.podManager.Stop(); err != nil {
+			return fmt.Errorf("error stopping Pod manager: %w", err)
+		}
+	}
 	if m.cancelContext != nil {
 		m.cancelContext()
 	}
@@ -245,7 +265,11 @@ func (m *ComputeDomainManager) AssertComputeDomainReady(ctx context.Context, cdU
 	}
 
 	// Check if the current node is ready in the ComputeDomain
-	if !m.isCurrentNodeReady(ctx, cd) {
+	ready, failed := m.isCurrentNodeReady(ctx, cd)
+	if failed {
+		return fmt.Errorf("%w: current node failed in ComputeDomain", ErrBindingFailure)
+	}
+	if !ready {
 		return fmt.Errorf("current node not ready in ComputeDomain")
 	}
 
@@ -256,23 +280,28 @@ func (m *ComputeDomainManager) AssertComputeDomainReady(ctx context.Context, cdU
 // When the feature gate is enabled, we check both the clique and the status to ensure
 // that compute domains started before the feature gate was enabled continue to work
 // even after the feature gate is enabled.
-func (m *ComputeDomainManager) isCurrentNodeReady(ctx context.Context, cd *nvapi.ComputeDomain) bool {
+func (m *ComputeDomainManager) isCurrentNodeReady(ctx context.Context, cd *nvapi.ComputeDomain) (bool, bool) {
 	if featuregates.Enabled(featuregates.ComputeDomainCliques) {
 		if m.isCurrentNodeReadyInClique(ctx, cd) {
-			return true
+			return true, false
 		}
 	}
 	return m.isCurrentNodeReadyInStatus(cd)
 }
 
 // isCurrentNodeReadyInStatus checks if the current node is marked as ready in the ComputeDomain status.
-func (m *ComputeDomainManager) isCurrentNodeReadyInStatus(cd *nvapi.ComputeDomain) bool {
+func (m *ComputeDomainManager) isCurrentNodeReadyInStatus(cd *nvapi.ComputeDomain) (bool, bool) {
 	for _, node := range cd.Status.Nodes {
 		if node.Name == m.config.flags.nodeName {
-			return node.Status == nvapi.ComputeDomainStatusReady
+			switch node.Status {
+			case nvapi.ComputeDomainStatusReady:
+				return true, false
+			case nvapi.ComputeDomainStatusFailed:
+				return false, true
+			}
 		}
 	}
-	return false
+	return false, false
 }
 
 // isCurrentNodeReadyInClique checks if the current node is marked as ready in the ComputeDomainClique.
diff --git a/cmd/compute-domain-kubelet-plugin/deviceinfo.go b/cmd/compute-domain-kubelet-plugin/deviceinfo.go
index 3d5a96cda..9383d1936 100644
--- a/cmd/compute-domain-kubelet-plugin/deviceinfo.go
+++ b/cmd/compute-domain-kubelet-plugin/deviceinfo.go
@@ -21,6 +21,8 @@ import (
 
 	resourceapi "k8s.io/api/resource/v1"
 	"k8s.io/utils/ptr"
+	nvapi "sigs.k8s.io/nvidia-dra-driver-gpu/api/nvidia.com/resource/v1beta1"
+	"sigs.k8s.io/nvidia-dra-driver-gpu/pkg/featuregates"
 )
 
 type ComputeDomainChannelInfo struct {
@@ -59,6 +61,10 @@ func (d *ComputeDomainChannelInfo) GetDevice() resourceapi.Device {
 			},
 		},
 	}
+	if featuregates.Enabled(featuregates.ComputeDomainBindingConditions) {
+		device.BindingConditions = []string{nvapi.ComputeDomainBindingConditions}
+		device.BindingFailureConditions = []string{nvapi.ComputeDomainBindingFailureConditions}
+	}
 	return device
 }
 
diff --git a/cmd/compute-domain-kubelet-plugin/main.go b/cmd/compute-domain-kubelet-plugin/main.go
index a8398e604..73ca0d6d0 100644
--- a/cmd/compute-domain-kubelet-plugin/main.go
+++ b/cmd/compute-domain-kubelet-plugin/main.go
@@ -36,6 +36,7 @@ import (
 	"sigs.k8s.io/nvidia-dra-driver-gpu/internal/info"
 	"sigs.k8s.io/nvidia-dra-driver-gpu/pkg/featuregates"
 	pkgflags "sigs.k8s.io/nvidia-dra-driver-gpu/pkg/flags"
+	"sigs.k8s.io/nvidia-dra-driver-gpu/pkg/workqueue"
 )
 
 const (
@@ -62,6 +63,8 @@ type Flags struct {
 type Config struct {
 	flags      *Flags
 	clientsets pkgflags.ClientSets
+	// workQueue manages the asynchronous processing of tasks
+	workQueue *workqueue.WorkQueue
 }
 
 func (c Config) DriverPluginPath() string {
@@ -180,9 +183,12 @@ func newApp() *cli.App {
 				return fmt.Errorf("create client: %w", err)
 			}
 
+			workQueue := workqueue.New(workqueue.DefaultControllerRateLimiter())
+
 			config := &Config{
 				flags:      flags,
 				clientsets: clientSets,
+				workQueue:  workQueue,
 			}
 
 			return RunPlugin(c.Context, config)
@@ -245,7 +251,7 @@ func RunPlugin(ctx context.Context, config *Config) error {
 		return fmt.Errorf("error creating driver: %w", err)
 	}
 
-	<-ctx.Done()
+	config.workQueue.Run(ctx)
 	if err := ctx.Err(); err != nil && !errors.Is(err, context.Canceled) {
 		// A canceled context is the normal case here when the process receives
 		// a signal. Only log the error for more interesting cases.
diff --git a/cmd/compute-domain-kubelet-plugin/pod.go b/cmd/compute-domain-kubelet-plugin/pod.go
new file mode 100644
index 000000000..6c6a1dfaa
--- /dev/null
+++ b/cmd/compute-domain-kubelet-plugin/pod.go
@@ -0,0 +1,359 @@
+/*
+ * Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package main
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"slices"
+	"sync"
+	"time"
+
+	corev1 "k8s.io/api/core/v1"
+	resourcev1 "k8s.io/api/resource/v1"
+	apierrors "k8s.io/apimachinery/pkg/api/errors"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/client-go/informers"
+	corev1listers "k8s.io/client-go/listers/core/v1"
+	"k8s.io/client-go/tools/cache"
+	"k8s.io/klog/v2"
+	nvapi "sigs.k8s.io/nvidia-dra-driver-gpu/api/nvidia.com/resource/v1beta1"
+)
+
+var ErrBindingFailure = errors.New("binding failure")
+
+type PodManager struct {
+	config        *Config
+	waitGroup     sync.WaitGroup
+	cancelContext context.CancelFunc
+
+	factory  informers.SharedInformerFactory
+	informer cache.SharedIndexInformer
+	lister   corev1listers.PodLister
+
+	assertNamespace          AssertNameSpaceFunc
+	addNodeLabel             AddNodeLabelFunc
+	removeNodeLabel          RemoveNodeLabelFunc
+	assertComputeDomainReady AssertComputeDomainReadyFunc
+}
+
+func NewPodManager(config *Config, assertNamespace AssertNameSpaceFunc, addNodeLabel AddNodeLabelFunc, removeNodeLabel RemoveNodeLabelFunc, assertComputeDomainReady AssertComputeDomainReadyFunc) *PodManager {
+	selector := fmt.Sprintf("status.nominatedNodeName=%s", config.flags.nodeName)
+
+	factory := informers.NewSharedInformerFactoryWithOptions(
+		config.clientsets.Core,
+		informerResyncPeriod,
+		informers.WithTweakListOptions(func(options *metav1.ListOptions) {
+			options.FieldSelector = selector
+		}),
+	)
+
+	informer := factory.Core().V1().Pods().Informer()
+	lister := factory.Core().V1().Pods().Lister()
+
+	return &PodManager{
+		config:                   config,
+		factory:                  factory,
+		informer:                 informer,
+		lister:                   lister,
+		assertNamespace:          assertNamespace,
+		addNodeLabel:             addNodeLabel,
+		removeNodeLabel:          removeNodeLabel,
+		assertComputeDomainReady: assertComputeDomainReady,
+	}
+}
+
+func (m *PodManager) Start(ctx context.Context) error {
+	ctx, cancel := context.WithCancel(ctx)
+	m.cancelContext = cancel
+
+	_, err := m.informer.AddEventHandler(cache.ResourceEventHandlerFuncs{
+		AddFunc: func(obj any) {
+			pod, ok := obj.(*corev1.Pod)
+			if !ok {
+				return
+			}
+			m.config.workQueue.EnqueueWithKey(obj, fmt.Sprintf("%s/%s", pod.Namespace, pod.Name), m.onAddOrUpdate)
+		},
+		UpdateFunc: func(oldObj, newObj any) {
+			pod, ok := newObj.(*corev1.Pod)
+			if !ok {
+				return
+			}
+			m.config.workQueue.EnqueueWithKey(newObj, fmt.Sprintf("%s/%s", pod.Namespace, pod.Name), m.onAddOrUpdate)
+		},
+	})
+	if err != nil {
+		return fmt.Errorf("error adding event handlers for Pod informer: %w", err)
+	}
+
+	m.waitGroup.Add(1)
+	go func() {
+		defer m.waitGroup.Done()
+		m.informer.Run(ctx.Done())
+	}()
+
+	if !cache.WaitForCacheSync(ctx.Done(), m.informer.HasSynced) {
+		return fmt.Errorf("informer cache sync for Pod failed")
+	}
+
+	return nil
+}
+
+func (m *PodManager) Stop() error {
+	if m.cancelContext != nil {
+		m.cancelContext()
+	}
+	m.waitGroup.Wait()
+	return nil
+}
+
+func (m *PodManager) onAddOrUpdate(ctx context.Context, obj any) error {
+	originalPod, ok := obj.(*corev1.Pod)
+	if !ok {
+		return fmt.Errorf("failed to cast to Pod")
+	}
+
+	// Get the latest Pod to avoid running reconcile on already deleted pods
+	pod, err := m.lister.Pods(originalPod.Namespace).Get(originalPod.Name)
+	if apierrors.IsNotFound(err) {
+		klog.V(2).Infof("Pod (%s/%s) was removed", originalPod.Namespace, originalPod.Name)
+		return nil
+	}
+	if err != nil {
+		return fmt.Errorf("error getting Pod: %w", err)
+	}
+
+	if pod.GetDeletionTimestamp() != nil {
+		return nil
+	}
+
+	klog.V(2).Infof("Processing added or updated Pod: %s/%s", pod.Namespace, pod.Name)
+
+	// Get all ResourceClaims associated with the Pod
+	rcs, err := m.GetResourceClaims(ctx, pod)
+	if err != nil {
+		return fmt.Errorf("error getting ResourceClaim: %w", err)
+	}
+
+	var config *nvapi.ComputeDomainChannelConfig
+	var targetRC *resourcev1.ResourceClaim
+	for _, rc := range rcs {
+		// Extract the target ResourceClaim from the Pod's ResourceClaims and get the corresponding ComputeDomainChannelConfig
+		config, err = m.getComputeDomainChannelRequestConfig(rc)
+		if err != nil {
+			return fmt.Errorf("error getting config for ComputeDomainChannel request from ResourceClaim %s/%s: %w", rc.Namespace, rc.Name, err)
+		}
+		if config != nil {
+			targetRC = rc
+			break
+		}
+	}
+
+	// Exit if Pod does not have ResourceClaim allocated to channel device
+	if config == nil {
+		return nil
+	}
+
+	// Get domain id
+	domainID := config.DomainID
+
+	// Check that ComputeDomain and ResourceClaim have the same namespace
+	err = m.assertNamespace(ctx, targetRC.Namespace, domainID)
+	if err != nil {
+		klog.Errorf("failed to assert Namespace for computeDomain with domainID %s and ResourceClaim %s/%s: %s", domainID, targetRC.Namespace, targetRC.Name, err.Error())
+		return nil
+	}
+
+	// Add node label to start IMEX DaemonSet pod
+	if err := m.addNodeLabel(ctx, domainID); err != nil {
+		return fmt.Errorf("error adding Node label for ComputeDomain: %w", err)
+	}
+
+	// Check ComputeDomain.status.nodes.status is ready
+	err = m.assertComputeDomainReady(ctx, domainID)
+
+	switch {
+	// Ready
+	case err == nil:
+		if err := m.SetBindingConditions(ctx, targetRC.Name, targetRC.Namespace, nvapi.ComputeDomainBindingConditions); err != nil {
+			return fmt.Errorf("error setting BindingConditions to ResourceClaim: %w", err)
+		}
+	// Failed
+	case errors.Is(err, ErrBindingFailure):
+		if err := m.SetBindingConditions(ctx, targetRC.Name, targetRC.Namespace, nvapi.ComputeDomainBindingFailureConditions); err != nil {
+			return fmt.Errorf("error setting BindingFailureConditions to ResourceClaim: %w", err)
+		}
+		if err := m.removeNodeLabel(ctx, domainID); err != nil {
+			return fmt.Errorf("error removing Node label for ComputeDomain: %w", err)
+		}
+		klog.V(2).Infof("asserting ComputeDomain Ready: %v", err)
+	default:
+		return fmt.Errorf("error asserting ComputeDomain Ready: %w", err)
+	}
+
+	return nil
+}
+
+func (m *PodManager) GetResourceClaims(ctx context.Context, pod *corev1.Pod) ([]*resourcev1.ResourceClaim, error) {
+	rcStatuses := pod.Status.ResourceClaimStatuses // only claims listed in the Pod status are fetched
+	var rcs []*resourcev1.ResourceClaim
+	for _, rcStatus := range rcStatuses { // NOTE(review): ResourceClaimName is a pointer and is dereferenced below without a nil check — confirm it is always set, otherwise this panics
+		rc, err := m.config.clientsets.Resource.ResourceClaims(pod.Namespace).Get(ctx, *rcStatus.ResourceClaimName, metav1.GetOptions{})
+		if err != nil {
+			return nil, fmt.Errorf("error Get API for ResourceClaim: %w", err) // the first failed Get aborts the whole lookup
+		}
+		rcs = append(rcs, rc)
+	}
+	return rcs, nil
+}
+
+// getComputeDomainChannelRequestConfig determines if a ResourceClaim is a monitoring target by filtering
+// its allocation results and returns the config information associated with the target allocation result.
+//
+// The processing target must meet the following conditions:
+// - The driver is "compute-domain.nvidia.com"
+// - The device is a channel device (determined by whether its corresponding config is ComputeDomainChannelConfig)
+// - The device has BindingConditions
+// - Neither a BindingConditions nor a BindingFailureConditions condition is set on the device yet
+func (m *PodManager) getComputeDomainChannelRequestConfig(rc *resourcev1.ResourceClaim) (*nvapi.ComputeDomainChannelConfig, error) {
+	if rc.Status.Allocation == nil || len(rc.Status.ReservedFor) == 0 {
+		return nil, fmt.Errorf("error ResourceClaim has no status")
+	}
+
+	configs, err := GetOpaqueDeviceConfigs(
+		nvapi.StrictDecoder,
+		DriverName,
+		rc.Status.Allocation.Devices.Config,
+	)
+	if err != nil {
+		return nil, err
+	}
+
+	for _, result := range rc.Status.Allocation.Devices.Results {
+		// Check the driver
+		if result.Driver != DriverName {
+			continue
+		}
+		// Check the device is channel device
+		for _, c := range slices.Backward(configs) {
+			if !slices.Contains(c.Requests, result.Request) {
+				continue
+			}
+
+			channelConfig, ok := c.Config.(*nvapi.ComputeDomainChannelConfig)
+			if !ok {
+				continue
+			}
+
+			if !slices.Contains(result.BindingConditions, nvapi.ComputeDomainBindingConditions) {
+				continue
+			}
+
+			if IsBindingConditionsAlreadySet(rc, &result) {
+				continue
+			}
+
+			return channelConfig, nil
+		}
+	}
+
+	return nil, nil
+}
+
+func (m *PodManager) SetBindingConditions(ctx context.Context, rcName, rcNamespace string, conditionType string) error {
+	rc, err := m.config.clientsets.Resource.ResourceClaims(rcNamespace).Get(ctx, rcName, metav1.GetOptions{}) // re-fetch so the status update is based on the latest object
+	if err != nil {
+		return fmt.Errorf("failed to get ResourceClaim %s/%s: %w", rcNamespace, rcName, err)
+	}
+	newRC := rc.DeepCopy()
+	if len(newRC.Status.Devices) == 0 && newRC.Status.Allocation != nil { // Allocation may be nil on the re-fetched claim; guard against a nil dereference
+		for _, allocationDevice := range newRC.Status.Allocation.Devices.Results {
+			device := &resourcev1.AllocatedDeviceStatus{
+				Driver: allocationDevice.Driver,
+				Pool:   allocationDevice.Pool,
+				Device: allocationDevice.Device,
+			}
+			newRC.Status.Devices = append(newRC.Status.Devices, *device)
+		}
+	}
+
+	if len(newRC.Status.Devices) == 0 { // nothing to annotate (no device status and no allocation results)
+		return nil
+	}
+
+	var reason string
+	var message string
+	switch conditionType {
+	case nvapi.ComputeDomainBindingConditions:
+		reason = "ComputeDomainSettingsSucceeded"
+		message = "binding succeeded — ComputeDomain status ready"
+	case nvapi.ComputeDomainBindingFailureConditions:
+		reason = "ComputeDomainSettingsFailed"
+		message = "binding failed — ComputeDomain status failed"
+	}
+
+	for i := range newRC.Status.Devices {
+		device := &newRC.Status.Devices[i]
+		newCondition := metav1.Condition{
+			Type:               conditionType,
+			Status:             metav1.ConditionTrue,
+			LastTransitionTime: metav1.NewTime(time.Now()),
+			Reason:             reason,
+			Message:            fmt.Sprintf("Device %s: %s", device.Device, message),
+		}
+		conditionExists := false
+		for j, existingCond := range device.Conditions {
+			if existingCond.Type == conditionType {
+				if existingCond.Status != newCondition.Status { // overwrite only when the status actually changes
+					device.Conditions[j] = newCondition
+				}
+				conditionExists = true
+				break
+			}
+		}
+		if !conditionExists {
+			device.Conditions = append(device.Conditions, newCondition)
+		}
+	}
+
+	_, err = m.config.clientsets.Resource.ResourceClaims(newRC.Namespace).UpdateStatus(ctx, newRC, metav1.UpdateOptions{})
+	if err != nil {
+		return fmt.Errorf("failed to update ResourceClaim %s/%s status with binding conditions: %w", newRC.Namespace, newRC.Name, err)
+	}
+	return nil
+}
+
+func IsBindingConditionsAlreadySet(rc *resourcev1.ResourceClaim, allocResult *resourcev1.DeviceRequestAllocationResult) bool {
+	for _, deviceStatus := range rc.Status.Devices {
+		if deviceStatus.Driver == allocResult.Driver && deviceStatus.Pool == allocResult.Pool && deviceStatus.Device == allocResult.Device {
+			for _, cond := range deviceStatus.Conditions {
+				// The success condition (ComputeDomainBindingConditions) is already True on this device
+				if cond.Type == nvapi.ComputeDomainBindingConditions && cond.Status == metav1.ConditionTrue {
+					return true
+				}
+				// The failure condition (ComputeDomainBindingFailureConditions) is already True on this device
+				if cond.Type == nvapi.ComputeDomainBindingFailureConditions && cond.Status == metav1.ConditionTrue {
+					return true
+				}
+			}
+		}
+	}
+
+	return false
+}
diff --git a/deployments/helm/nvidia-dra-driver-gpu/crds/resource.nvidia.com_computedomains.yaml b/deployments/helm/nvidia-dra-driver-gpu/crds/resource.nvidia.com_computedomains.yaml
index 5a28ae17c..43c5273ec 100644
--- a/deployments/helm/nvidia-dra-driver-gpu/crds/resource.nvidia.com_computedomains.yaml
+++ b/deployments/helm/nvidia-dra-driver-gpu/crds/resource.nvidia.com_computedomains.yaml
@@ -136,6 +136,7 @@ spec:
                       enum:
                       - Ready
                       - NotReady
+                      - Failed
                       type: string
                   required:
                   - cliqueID
diff --git a/deployments/helm/nvidia-dra-driver-gpu/templates/rbac-kubeletplugin.yaml b/deployments/helm/nvidia-dra-driver-gpu/templates/rbac-kubeletplugin.yaml
index eb7c95cea..61bffc1b4 100644
--- a/deployments/helm/nvidia-dra-driver-gpu/templates/rbac-kubeletplugin.yaml
+++ b/deployments/helm/nvidia-dra-driver-gpu/templates/rbac-kubeletplugin.yaml
@@ -11,6 +11,9 @@ rules:
 - apiGroups: ["resource.k8s.io"]
   resources: ["resourceclaims"]
   verbs: ["get", "list", "watch"]
+- apiGroups: ["resource.k8s.io"]
+  resources: ["resourceclaims/status"]
+  verbs: ["update"]
 - apiGroups: ["resource.k8s.io"]
   resources: ["resourceslices"]
   verbs: ["get", "list", "watch", "create", "update", "delete"]
diff --git a/pkg/featuregates/featuregates.go b/pkg/featuregates/featuregates.go
index ed0ca3586..ed971d96a 100644
--- a/pkg/featuregates/featuregates.go
+++ b/pkg/featuregates/featuregates.go
@@ -55,6 +55,10 @@ const (
 	// CrashOnNVLinkFabricErrors causes the kubelet plugin to crash instead of
 	// falling back to non-fabric mode when NVLink fabric errors are detected.
 	CrashOnNVLinkFabricErrors featuregate.Feature = "CrashOnNVLinkFabricErrors"
+
+	// ComputeDomainBindingConditions enables scheduling of workload pods with channel devices
+	// to be delayed by DRADeviceBindingConditions until the IMEX Daemon Pods complete their processing.
+	ComputeDomainBindingConditions featuregate.Feature = "ComputeDomainBindingConditions"
 )
 
 // defaultFeatureGates contains the default settings for all project-specific feature gates.
@@ -116,6 +120,13 @@ var defaultFeatureGates = map[featuregate.Feature]featuregate.VersionedSpecs{
 			Version:    version.MajorMinor(25, 12),
 		},
 	},
+	ComputeDomainBindingConditions: {
+		{
+			Default:    false,
+			PreRelease: featuregate.Alpha,
+			Version:    version.MajorMinor(26, 4),
+		},
+	},
 }
 
 var (