Skip to content

Commit 9952c7b

Browse files
committed
draclient: do not fail CNI when a DRA claim contributes nothing
- If every allocation result for a claim is skipped (no slice match, missing Multus deviceID/resourceName, etc.), log a warning and continue to the next claim instead of returning an error, so kubelet/device-plugin entries stay usable (hybrid legacy VF + broken or irrelevant DRA claims).
- getDeviceInfo: treat missing Multus deviceID on a matched device as the same skippable sentinel as “not in slice”, avoiding a misleading final error.
- Tests: expect nil error for unmapped claims; add case preserving pre-filled resource map entries; keep VF+GPU success with empty GPU slice.
- k8sclient: stub kubelet ResourceClient in DRA failure test; logging: Warningf.

Signed-off-by: Fred Rolland <frolland@nvidia.com>
1 parent 22411b3 commit 9952c7b

2 files changed

Lines changed: 181 additions & 18 deletions

File tree

pkg/draclient/draclient.go

Lines changed: 35 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ package draclient
1616

1717
import (
1818
"context"
19+
"errors"
1920
"fmt"
2021
"time"
2122

@@ -33,6 +34,11 @@ const (
3334
multusResourceNameAttr = "k8s.cni.cncf.io/resourceName"
3435
)
3536

37+
// errDeviceNotInAnySlice marks an allocation result that cannot be resolved for Multus: the device is
38+
// absent from every ResourceSlice for that driver/pool, or is listed without a usable deviceID attribute
39+
// (wrapped in getDeviceInfo). Callers may skip such results so multi-device claims (e.g. SR-IOV + GPU) still succeed for CNI.
40+
var errDeviceNotInAnySlice = errors.New("device not in any matching resource slice")
41+
3642
// namespacedClaimCacheKey avoids cache collisions: ResourceClaim is namespaced.
3743
func namespacedClaimCacheKey(namespace, claimName string) string {
3844
return namespace + "/" + claimName
@@ -99,20 +105,29 @@ func (d *draClient) GetPodResourceMap(pod *v1.Pod, resourceMap map[string]*types
99105
return fmt.Errorf("claim %s has no device allocation", claimName)
100106
}
101107

102-
for _, result := range resourceClaim.Status.Allocation.Devices.Results {
108+
results := resourceClaim.Status.Allocation.Devices.Results
109+
resolvedCount := 0
110+
for _, result := range results {
103111
logging.Debugf("GetPodResourceMap: processing device allocation - driver: %s, pool: %s, device: %s, request: %s",
104112
result.Driver, result.Pool, result.Device, result.Request)
105113

106114
info, err := d.getDeviceInfo(ctx, result)
107115
if err != nil {
116+
if errors.Is(err, errDeviceNotInAnySlice) {
117+
logging.Warningf(
118+
"GetPodResourceMap: skipping allocation result for claim %s (driver=%s pool=%s device=%s): %v",
119+
claimName, result.Driver, result.Pool, result.Device, err)
120+
continue
121+
}
108122
logging.Errorf("GetPodResourceMap: failed to get device info for claim %s: %v", claimName, err)
109123
return err
110124
}
111125

112126
if info.ResourceName == "" {
113-
resErr := fmt.Errorf("device %s missing required attribute %s (must match NAD k8s.v1.cni.cncf.io/resourceName)", result.Device, multusResourceNameAttr)
114-
logging.Errorf("GetPodResourceMap: %v", resErr)
115-
return resErr
127+
logging.Warningf(
128+
"GetPodResourceMap: skipping allocation result for claim %s (driver=%s pool=%s device=%s): no %q (only devices published for CNI are mapped)",
129+
claimName, result.Driver, result.Pool, result.Device, multusResourceNameAttr)
130+
continue
116131
}
117132

118133
resourceMapKey := info.ResourceName
@@ -123,6 +138,14 @@ func (d *draClient) GetPodResourceMap(pod *v1.Pod, resourceMap map[string]*types
123138
resourceMap[resourceMapKey] = &types.ResourceInfo{DeviceIDs: []string{info.DeviceID}}
124139
logging.Debugf("GetPodResourceMap: created new resource map entry %s with device ID %s", resourceMapKey, info.DeviceID)
125140
}
141+
resolvedCount++
142+
}
143+
if resolvedCount == 0 && len(results) > 0 {
144+
logging.Warningf(
145+
"GetPodResourceMap: claim %s had no allocation results mapped for Multus (skipping this claim; existing kubelet/device-plugin map entries are kept). "+
146+
"Fix DRA ResourceSlices or Multus attributes if this claim should contribute to CNI.",
147+
claimName)
148+
continue
126149
}
127150
logging.Debugf("GetPodResourceMap: successfully processed resource claim %s", claimName)
128151
}
@@ -259,16 +282,18 @@ func (d *draClient) getDeviceInfo(ctx context.Context, result resourcev1api.Devi
259282
devIDAttr, exists := device.Attributes[multusDeviceIDAttr]
260283
if !exists {
261284
logging.Warningf(
262-
"getDeviceInfo: allocated device %q (driver %q, pool %q) has no %q attribute; DRA drivers must publish this for Multus; skipping device",
285+
"getDeviceInfo: device %q (driver %q, pool %q) has no %q in ResourceSlice; skipping allocation result",
263286
device.Name, result.Driver, result.Pool, multusDeviceIDAttr)
264-
continue
287+
return nil, fmt.Errorf("%w: device %q present in slice but missing %q",
288+
errDeviceNotInAnySlice, device.Name, multusDeviceIDAttr)
265289
}
266290

267291
if devIDAttr.StringValue == nil {
268292
logging.Warningf(
269-
"getDeviceInfo: allocated device %q (driver %q, pool %q) has %q with nil StringValue; skipping device",
293+
"getDeviceInfo: device %q (driver %q, pool %q) has %q with nil StringValue; skipping allocation result",
270294
device.Name, result.Driver, result.Pool, multusDeviceIDAttr)
271-
continue
295+
return nil, fmt.Errorf("%w: device %q has nil StringValue for %q",
296+
errDeviceNotInAnySlice, device.Name, multusDeviceIDAttr)
272297
}
273298
info := &deviceInfo{DeviceID: *devIDAttr.StringValue}
274299

@@ -283,7 +308,8 @@ func (d *draClient) getDeviceInfo(ctx context.Context, result resourcev1api.Devi
283308
}
284309
}
285310

286-
notFoundErr := fmt.Errorf("device %s not found for claim resource %s/%s in any matching resource slice", result.Device, result.Driver, result.Pool)
311+
notFoundErr := fmt.Errorf("%w: device %s not found for claim resource %s/%s in any matching resource slice",
312+
errDeviceNotInAnySlice, result.Device, result.Driver, result.Pool)
287313
logging.Errorf("getDeviceInfo: %v", notFoundErr)
288314
return nil, notFoundErr
289315
}

pkg/draclient/draclient_test.go

Lines changed: 146 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -705,7 +705,7 @@ var _ = Describe("DRA Client operations", func() {
705705
})
706706

707707
Context("when device does not have deviceID attribute", func() {
708-
It("should return an error", func() {
708+
It("should warn and skip the claim without failing", func() {
709709
claimName := "test-claim"
710710
deviceName := "device-1"
711711
driverName := "test-driver.example.com"
@@ -785,13 +785,13 @@ var _ = Describe("DRA Client operations", func() {
785785
// Execute
786786
resourceMap := make(map[string]*types.ResourceInfo)
787787
err = draClient.GetPodResourceMap(pod, resourceMap)
788-
Expect(err).To(HaveOccurred())
789-
Expect(err.Error()).To(ContainSubstring("not found for claim resource"))
788+
Expect(err).NotTo(HaveOccurred())
789+
Expect(resourceMap).To(BeEmpty())
790790
})
791791
})
792792

793793
Context("when device has deviceID but missing resourceName attribute", func() {
794-
It("should return an error", func() {
794+
It("should warn and skip without failing when nothing maps to a resource name", func() {
795795
claimName := "test-claim"
796796
deviceName := "device-1"
797797
driverName := "test-driver.example.com"
@@ -846,13 +846,13 @@ var _ = Describe("DRA Client operations", func() {
846846

847847
resourceMap := make(map[string]*types.ResourceInfo)
848848
err = draClient.GetPodResourceMap(pod, resourceMap)
849-
Expect(err).To(HaveOccurred())
850-
Expect(err.Error()).To(ContainSubstring(multusResourceNameAttr))
849+
Expect(err).NotTo(HaveOccurred())
850+
Expect(resourceMap).To(BeEmpty())
851851
})
852852
})
853853

854854
Context("when device name in allocation does not match any device in slice", func() {
855-
It("should return an error", func() {
855+
It("should warn and skip the claim without failing", func() {
856856
claimName := "test-claim"
857857
deviceName := "device-1"
858858
wrongDeviceName := "wrong-device"
@@ -934,8 +934,145 @@ var _ = Describe("DRA Client operations", func() {
934934
// Execute
935935
resourceMap := make(map[string]*types.ResourceInfo)
936936
err = draClient.GetPodResourceMap(pod, resourceMap)
937-
Expect(err).To(HaveOccurred())
938-
Expect(err.Error()).To(ContainSubstring("not found for claim resource"))
937+
Expect(err).NotTo(HaveOccurred())
938+
Expect(resourceMap).To(BeEmpty())
939+
})
940+
941+
It("should preserve existing kubelet map entries when the claim maps nothing", func() {
942+
claimName := "test-claim"
943+
deviceName := "device-1"
944+
wrongDeviceName := "wrong-device"
945+
driverName := "test-driver.example.com"
946+
poolName := "test-pool"
947+
requestName := "gpu"
948+
deviceID := "pci:0000:00:01.0"
949+
legacyKey := "example.com/legacy-vf"
950+
legacyPCI := "0000:8d:00.4"
951+
952+
deviceIDValue := deviceID
953+
resourceSlice := &resourcev1api.ResourceSlice{
954+
ObjectMeta: metav1.ObjectMeta{Name: "test-resource-slice"},
955+
Spec: resourcev1api.ResourceSliceSpec{
956+
Driver: driverName,
957+
Pool: resourcev1api.ResourcePool{Name: poolName, ResourceSliceCount: 1},
958+
Devices: []resourcev1api.Device{
959+
{
960+
Name: deviceName,
961+
Attributes: map[resourcev1api.QualifiedName]resourcev1api.DeviceAttribute{
962+
multusDeviceIDAttr: {StringValue: &deviceIDValue},
963+
},
964+
},
965+
},
966+
},
967+
}
968+
resourceClaim := &resourcev1api.ResourceClaim{
969+
ObjectMeta: metav1.ObjectMeta{Name: claimName, Namespace: "default"},
970+
Status: resourcev1api.ResourceClaimStatus{
971+
Allocation: &resourcev1api.AllocationResult{
972+
Devices: resourcev1api.DeviceAllocationResult{
973+
Results: []resourcev1api.DeviceRequestAllocationResult{
974+
{Request: requestName, Driver: driverName, Pool: poolName, Device: wrongDeviceName},
975+
},
976+
},
977+
},
978+
},
979+
}
980+
claimNamePtr := claimName
981+
pod := &v1.Pod{
982+
ObjectMeta: metav1.ObjectMeta{Name: "hybrid-pod", Namespace: "default", UID: k8sTypes.UID("uid-hybrid")},
983+
Status: v1.PodStatus{
984+
ResourceClaimStatuses: []v1.PodResourceClaimStatus{
985+
{Name: claimName, ResourceClaimName: &claimNamePtr},
986+
},
987+
},
988+
}
989+
_, err := fakeClient.ResourceV1().ResourceClaims("default").Create(context.TODO(), resourceClaim, metav1.CreateOptions{})
990+
Expect(err).NotTo(HaveOccurred())
991+
_, err = fakeClient.ResourceV1().ResourceSlices().Create(context.TODO(), resourceSlice, metav1.CreateOptions{})
992+
Expect(err).NotTo(HaveOccurred())
993+
994+
resourceMap := map[string]*types.ResourceInfo{
995+
legacyKey: {DeviceIDs: []string{legacyPCI}},
996+
}
997+
err = draClient.GetPodResourceMap(pod, resourceMap)
998+
Expect(err).NotTo(HaveOccurred())
999+
Expect(resourceMap[legacyKey].DeviceIDs).To(Equal([]string{legacyPCI}))
1000+
})
1001+
1002+
It("should succeed when one allocation result is missing from slices but another resolves", func() {
1003+
claimName := "multi-claim"
1004+
driverSRIOV := "sriovnetwork.k8snetworkplumbingwg.io"
1005+
driverGPU := "gpu.nvidia.com"
1006+
poolName := "orch-dev-a100-002"
1007+
deviceVF := "0000-8d-00-4"
1008+
deviceGPU := "gpu-4"
1009+
deviceIDVF := "pci:0000:8d:00.4"
1010+
mapKey := "nvidia.com/port2"
1011+
deviceIDVFVal := deviceIDVF
1012+
mapKeyVal := mapKey
1013+
1014+
// SR-IOV slice has the VF; empty GPU slice exists so List() finds gpu.nvidia.com/pool (real clusters
1015+
// always publish a slice per driver/pool). Allocation still references gpu-4, which is not listed here.
1016+
resourceSlice := &resourcev1api.ResourceSlice{
1017+
ObjectMeta: metav1.ObjectMeta{Name: "sriov-slice"},
1018+
Spec: resourcev1api.ResourceSliceSpec{
1019+
Driver: driverSRIOV,
1020+
Pool: resourcev1api.ResourcePool{Name: poolName, ResourceSliceCount: 1},
1021+
Devices: []resourcev1api.Device{
1022+
{
1023+
Name: deviceVF,
1024+
Attributes: map[resourcev1api.QualifiedName]resourcev1api.DeviceAttribute{
1025+
multusDeviceIDAttr: {StringValue: &deviceIDVFVal},
1026+
multusResourceNameAttr: {StringValue: &mapKeyVal},
1027+
},
1028+
},
1029+
},
1030+
},
1031+
}
1032+
gpuSlice := &resourcev1api.ResourceSlice{
1033+
ObjectMeta: metav1.ObjectMeta{Name: "gpu-slice"},
1034+
Spec: resourcev1api.ResourceSliceSpec{
1035+
Driver: driverGPU,
1036+
Pool: resourcev1api.ResourcePool{Name: poolName, ResourceSliceCount: 1},
1037+
Devices: []resourcev1api.Device{},
1038+
},
1039+
}
1040+
1041+
resourceClaim := &resourcev1api.ResourceClaim{
1042+
ObjectMeta: metav1.ObjectMeta{Name: claimName, Namespace: "default"},
1043+
Status: resourcev1api.ResourceClaimStatus{
1044+
Allocation: &resourcev1api.AllocationResult{
1045+
Devices: resourcev1api.DeviceAllocationResult{
1046+
Results: []resourcev1api.DeviceRequestAllocationResult{
1047+
{Request: "vf", Driver: driverSRIOV, Pool: poolName, Device: deviceVF},
1048+
{Request: "gpu", Driver: driverGPU, Pool: poolName, Device: deviceGPU},
1049+
},
1050+
},
1051+
},
1052+
},
1053+
}
1054+
1055+
claimNamePtr := claimName
1056+
pod := &v1.Pod{
1057+
ObjectMeta: metav1.ObjectMeta{Name: "mixed-pod", Namespace: "default", UID: k8sTypes.UID("uid-mixed")},
1058+
Status: v1.PodStatus{
1059+
ResourceClaimStatuses: []v1.PodResourceClaimStatus{
1060+
{Name: claimName, ResourceClaimName: &claimNamePtr},
1061+
},
1062+
},
1063+
}
1064+
1065+
_, err := fakeClient.ResourceV1().ResourceClaims("default").Create(context.TODO(), resourceClaim, metav1.CreateOptions{})
1066+
Expect(err).NotTo(HaveOccurred())
1067+
_, err = fakeClient.ResourceV1().ResourceSlices().Create(context.TODO(), resourceSlice, metav1.CreateOptions{})
1068+
Expect(err).NotTo(HaveOccurred())
1069+
_, err = fakeClient.ResourceV1().ResourceSlices().Create(context.TODO(), gpuSlice, metav1.CreateOptions{})
1070+
Expect(err).NotTo(HaveOccurred())
1071+
1072+
resourceMap := make(map[string]*types.ResourceInfo)
1073+
err = draClient.GetPodResourceMap(pod, resourceMap)
1074+
Expect(err).NotTo(HaveOccurred())
1075+
Expect(resourceMap[mapKey].DeviceIDs).To(Equal([]string{deviceIDVF}))
9391076
})
9401077
})
9411078

0 commit comments

Comments
 (0)