@@ -41,9 +41,17 @@ func NewGPUResourceLabeler(config *spec.Config, device resource.Device, count in
4141 return empty {}, nil
4242 }
4343
44+ // Check if device is healthy before querying
45+ if err := resource .CheckDeviceHealth (device ); err != nil {
46+ klog .Warningf ("Skipping unhealthy device in GPU resource labeler: %v" , err )
47+ return empty {}, nil
48+ }
49+
4450 model , err := device .GetName ()
4551 if err != nil {
46- return nil , fmt .Errorf ("failed to get device model: %v" , err )
52+ // If health check passed but GetName fails, log and skip
53+ klog .Warningf ("Device health check passed but GetName failed: %v" , err )
54+ return empty {}, nil
4755 }
4856
4957 totalMemoryMiB , err := device .GetTotalMemoryMiB ()
@@ -55,7 +63,9 @@ func NewGPUResourceLabeler(config *spec.Config, device resource.Device, count in
5563
5664 architectureLabels , err := newArchitectureLabels (resourceLabeler , device )
5765 if err != nil {
58- return nil , fmt .Errorf ("failed to create architecture labels: %v" , err )
66+ // Don't crash on architecture label failure, log and continue
67+ klog .Warningf ("Failed to create architecture labels: %v" , err )
68+ architectureLabels = make (Labels )
5969 }
6070
6171 memoryLabeler := (Labeler )(& empty {})
@@ -78,25 +88,35 @@ func NewMIGResourceLabeler(resourceName spec.ResourceName, config *spec.Config,
7888 return empty {}, nil
7989 }
8090
91+ // Check if device is healthy before querying
92+ if err := resource .CheckDeviceHealth (device ); err != nil {
93+ klog .Warningf ("Skipping unhealthy MIG device in resource labeler: %v" , err )
94+ return empty {}, nil
95+ }
96+
8197 parent , err := device .GetDeviceHandleFromMigDeviceHandle ()
8298 if err != nil {
83- return nil , fmt .Errorf ("failed to get parent of MIG device: %v" , err )
99+ klog .Warningf ("Failed to get parent of MIG device, skipping: %v" , err )
100+ return empty {}, nil
84101 }
85102 model , err := parent .GetName ()
86103 if err != nil {
87- return nil , fmt .Errorf ("failed to get device model: %v" , err )
104+ klog .Warningf ("Failed to get device model, skipping: %v" , err )
105+ return empty {}, nil
88106 }
89107
90108 migProfile , err := device .GetName ()
91109 if err != nil {
92- return nil , fmt .Errorf ("failed to get MIG profile name: %v" , err )
110+ klog .Warningf ("Failed to get MIG profile name, skipping: %v" , err )
111+ return empty {}, nil
93112 }
94113
95114 resourceLabeler := newResourceLabeler (resourceName , config )
96115
97116 attributeLabels , err := newMigAttributeLabels (resourceLabeler , device )
98117 if err != nil {
99- return nil , fmt .Errorf ("faled to get MIG attribute labels: %v" , err )
118+ klog .Warningf ("Failed to get MIG attribute labels: %v" , err )
119+ attributeLabels = make (Labels )
100120 }
101121
102122 labelers := Merge (
@@ -252,7 +272,9 @@ func (rl resourceLabeler) replicationInfo() *spec.ReplicatedResource {
252272func newMigAttributeLabels (rl resourceLabeler , device resource.Device ) (Labels , error ) {
253273 attributes , err := device .GetAttributes ()
254274 if err != nil {
255- return nil , fmt .Errorf ("unable to get attributes of MIG device: %v" , err )
275+ // Return empty labels instead of crashing
276+ klog .Warningf ("Unable to get attributes of MIG device, skipping: %v" , err )
277+ return make (Labels ), nil
256278 }
257279
258280 labels := rl .labels (attributes )
@@ -263,7 +285,9 @@ func newMigAttributeLabels(rl resourceLabeler, device resource.Device) (Labels,
263285func newArchitectureLabels (rl resourceLabeler , device resource.Device ) (Labels , error ) {
264286 computeMajor , computeMinor , err := device .GetCudaComputeCapability ()
265287 if err != nil {
266- return nil , fmt .Errorf ("failed to determine CUDA compute capability: %v" , err )
288+ // Return empty labels instead of error - allows labeling to continue
289+ klog .Warningf ("Failed to determine CUDA compute capability, skipping architecture labels: %v" , err )
290+ return make (Labels ), nil
267291 }
268292
269293 if computeMajor == 0 {
0 commit comments