Skip to content

Commit 01bbaa6

Browse files
Addressing review comments
Signed-off-by: Vishesh Tanksale <vtanksale@nvidia.com>
1 parent d76351e commit 01bbaa6

File tree

3 files changed

+17
-6
lines changed

3 files changed

+17
-6
lines changed

internal/controller/nimservice_controller.go

Lines changed: 14 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -134,7 +134,10 @@ func (r *NIMServiceReconciler) Reconcile(ctx context.Context, req ctrl.Request)
134134

135135
logger.Info("Reconciling", "NIMService", nimService.Name)
136136

137-
if nimService.Spec.MultiNode != nil && nimService.Annotations != nil {
137+
if nimService.Spec.MultiNode != nil {
138+
if nimService.Annotations == nil {
139+
nimService.Annotations = map[string]string{}
140+
}
138141
if _, ok := nimService.Annotations[utils.GPUCountPerPodAnnotationKey]; !ok {
139142
gpuCountPerPod, err := shared.GetGPUCountPerPod(ctx, r.GetClient(), nimService)
140143
if err != nil {
@@ -272,6 +275,16 @@ func (r *NIMServiceReconciler) SetupWithManager(mgr ctrl.Manager) error {
272275
return err
273276
}
274277

278+
exists, err := k8sutil.CRDExists(r.discoveryClient, resourcev1beta2.SchemeGroupVersion.WithResource("deviceclasses"))
279+
if err != nil {
280+
return err
281+
}
282+
if exists {
283+
if _, err := mgr.GetCache().GetInformer(context.Background(), &resourcev1beta2.DeviceClass{}); err != nil {
284+
return err
285+
}
286+
}
287+
275288
nimServiceBuilder := ctrl.NewControllerManagedBy(mgr).
276289
For(&appsv1alpha1.NIMService{}).
277290
Owns(&appsv1.Deployment{}).

internal/controller/platform/kserve/nimservice.go

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -578,7 +578,7 @@ func (r *NIMServiceReconciler) addGPUResources(ctx context.Context, nimService *
578578
// Get tensorParallelism from the profile
579579
tensorParallelism, err := utils.GetTensorParallelismByProfileTags(profile.Config)
580580
if err != nil {
581-
logger.Error(err, "Failed to retrieve tensorParallelism")
581+
logger.Error(err, "Missing nvidia.com/gpu resource request/limit and unable to retrieve tensorParallelism for NIM profile")
582582
return nil, err
583583
}
584584
if tensorParallelism != "" {

internal/shared/resourceclaims.go

Lines changed: 2 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -331,8 +331,7 @@ func getGPUCountFromDeviceRequests(ctx context.Context, client client.Client, re
331331

332332
isGPU, err := isNVIDIAGPU(ctx, client, req.Exactly.DeviceClassName)
333333
if err != nil {
334-
// This allows partial success scenarios
335-
continue
334+
return 0, fmt.Errorf("failed to check if device class %s is a GPU: %w", req.Exactly.DeviceClassName, err)
336335
}
337336

338337
if isGPU {
@@ -353,8 +352,7 @@ func getGPUCountFromDeviceSpecs(ctx context.Context, client client.Client, devic
353352

354353
isGPU, err := isNVIDIAGPU(ctx, client, dev.DeviceClassName)
355354
if err != nil {
356-
// This allows partial success scenarios
357-
continue
355+
return 0, fmt.Errorf("failed to check if device class %s is a GPU: %w", dev.DeviceClassName, err)
358356
}
359357

360358
if isGPU {

0 commit comments

Comments
 (0)