Skip to content

Commit 72a326d

Browse files
Cleanup Multi Node API to avoid redundant GPU input
Signed-off-by: Vishesh Tanksale <vtanksale@nvidia.com>
1 parent f51a805 commit 72a326d

File tree

13 files changed

+78
-88
lines changed

13 files changed

+78
-88
lines changed

api/apps/v1alpha1/nimservice_types.go

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -145,10 +145,6 @@ type NimServiceMultiNodeConfig struct {
145145
// +kubebuilder:validation:Minimum=1
146146
Size int `json:"size,omitempty"`
147147

148-
// +kubebuilder:default:=1
149-
// GPUSPerPod specifies the number of GPUs for each instance. In most cases, this should match `resources.limits.nvidia.com/gpu`.
150-
GPUSPerPod int `json:"gpusPerPod,omitempty"`
151-
152148
// MPI config for NIMService using LeaderWorkerSet
153149
MPI *MultiNodeMPIConfig `json:"mpi,omitempty"`
154150
}
@@ -227,6 +223,16 @@ func (n *NIMService) GetLWSName() string {
227223
return fmt.Sprintf("%s-lws", n.GetName())
228224
}
229225

226+
// GetMultiNodeGPUsPerPod returns the number of GPUs per pod for the multi-node NIMService.
227+
func (n *NIMService) GetMultiNodeGPUsPerPod() int {
228+
gpuQuantity, ok := n.Spec.Resources.Requests["nvidia.com/gpu"]
229+
if !ok {
230+
// Return 0 if no GPU request is specified; auto-determining based on tp*pp/(.spec.multiNode.size) is a TODO.
231+
return 0
232+
}
233+
return int(gpuQuantity.Value())
234+
}
235+
230236
// GetPVCName returns the name to be used for the PVC based on the custom spec
231237
// Prefers pvc.Name if explicitly set by the user in the NIMService instance.
232238
func (n *NIMService) GetPVCName(pvc PersistentVolumeClaim) string {
@@ -336,7 +342,7 @@ func (n *NIMService) getLWSCommonEnv() []corev1.EnvVar {
336342
},
337343
{
338344
Name: "NIM_TENSOR_PARALLEL_SIZE",
339-
Value: fmt.Sprintf("%d", n.Spec.MultiNode.GPUSPerPod),
345+
Value: fmt.Sprintf("%d", n.GetMultiNodeGPUsPerPod()),
340346
},
341347
{
342348
Name: "NIM_PIPELINE_PARALLEL_SIZE",
@@ -377,7 +383,7 @@ func (n *NIMService) GetLWSLeaderEnv() []corev1.EnvVar {
377383
},
378384
{
379385
Name: "GPUS_PER_NODE",
380-
Value: fmt.Sprintf("%d", n.Spec.MultiNode.GPUSPerPod),
386+
Value: fmt.Sprintf("%d", n.GetMultiNodeGPUsPerPod()),
381387
},
382388
{
383389
Name: "CLUSTER_START_TIMEOUT",
@@ -1198,10 +1204,10 @@ func (n *NIMService) generateMPIConfigData() map[string]string {
11981204
// Construct ConfigMap data
11991205
data := make(map[string]string)
12001206
for i := 0; i < n.Spec.Replicas; i++ {
1201-
hostfile := fmt.Sprintf("localhost slots=%d\n", n.Spec.MultiNode.GPUSPerPod)
1207+
hostfile := fmt.Sprintf("localhost slots=%d\n", n.GetMultiNodeGPUsPerPod())
12021208
for j := 1; j < n.Spec.MultiNode.Size; j++ {
12031209
workerHostname := fmt.Sprintf("%s-%d-%d.%s.%s.svc slots=%d",
1204-
n.GetLWSName(), i, j, n.GetLWSName(), n.GetNamespace(), n.Spec.MultiNode.GPUSPerPod)
1210+
n.GetLWSName(), i, j, n.GetLWSName(), n.GetNamespace(), n.GetMultiNodeGPUsPerPod())
12051211
hostfile += workerHostname + "\n"
12061212
}
12071213
dataKey := fmt.Sprintf("hostfile-%d", i)

bundle/manifests/apps.nvidia.com_nimpipelines.yaml

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1293,12 +1293,6 @@ spec:
12931293
enum:
12941294
- lws
12951295
type: string
1296-
gpusPerPod:
1297-
default: 1
1298-
description: GPUSPerPod specifies the number of GPUs
1299-
for each instance. In most cases, this should match
1300-
`resources.limits.nvidia.com/gpu`.
1301-
type: integer
13021296
mpi:
13031297
description: MPI config for NIMService using LeaderWorkerSet
13041298
properties:

bundle/manifests/apps.nvidia.com_nimservices.yaml

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1226,11 +1226,6 @@ spec:
12261226
enum:
12271227
- lws
12281228
type: string
1229-
gpusPerPod:
1230-
default: 1
1231-
description: GPUSPerPod specifies the number of GPUs for each
1232-
instance. In most cases, this should match `resources.limits.nvidia.com/gpu`.
1233-
type: integer
12341229
mpi:
12351230
description: MPI config for NIMService using LeaderWorkerSet
12361231
properties:

config/crd/bases/apps.nvidia.com_nimpipelines.yaml

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1293,12 +1293,6 @@ spec:
12931293
enum:
12941294
- lws
12951295
type: string
1296-
gpusPerPod:
1297-
default: 1
1298-
description: GPUSPerPod specifies the number of GPUs
1299-
for each instance. In most cases, this should match
1300-
`resources.limits.nvidia.com/gpu`.
1301-
type: integer
13021296
mpi:
13031297
description: MPI config for NIMService using LeaderWorkerSet
13041298
properties:

config/crd/bases/apps.nvidia.com_nimservices.yaml

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1226,11 +1226,6 @@ spec:
12261226
enum:
12271227
- lws
12281228
type: string
1229-
gpusPerPod:
1230-
default: 1
1231-
description: GPUSPerPod specifies the number of GPUs for each
1232-
instance. In most cases, this should match `resources.limits.nvidia.com/gpu`.
1233-
type: integer
12341229
mpi:
12351230
description: MPI config for NIMService using LeaderWorkerSet
12361231
properties:

deployments/helm/k8s-nim-operator/crds/apps.nvidia.com_nimpipelines.yaml

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1293,12 +1293,6 @@ spec:
12931293
enum:
12941294
- lws
12951295
type: string
1296-
gpusPerPod:
1297-
default: 1
1298-
description: GPUSPerPod specifies the number of GPUs
1299-
for each instance. In most cases, this should match
1300-
`resources.limits.nvidia.com/gpu`.
1301-
type: integer
13021296
mpi:
13031297
description: MPI config for NIMService using LeaderWorkerSet
13041298
properties:

deployments/helm/k8s-nim-operator/crds/apps.nvidia.com_nimservices.yaml

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1226,11 +1226,6 @@ spec:
12261226
enum:
12271227
- lws
12281228
type: string
1229-
gpusPerPod:
1230-
default: 1
1231-
description: GPUSPerPod specifies the number of GPUs for each
1232-
instance. In most cases, this should match `resources.limits.nvidia.com/gpu`.
1233-
type: integer
12341229
mpi:
12351230
description: MPI config for NIMService using LeaderWorkerSet
12361231
properties:

internal/controller/platform/kserve/nimservice.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -577,7 +577,7 @@ func (r *NIMServiceReconciler) addGPUResources(ctx context.Context, nimService *
577577
// if deployed as multi-node, use the GPU per worker value to assign GPU resources to each worker
578578
// TODO: auto-determine based on tp*pp/(.spec.multiNode.size)
579579
if nimService.Spec.MultiNode != nil {
580-
gpuQuantity, err = apiResource.ParseQuantity(fmt.Sprintf("%d", nimService.Spec.MultiNode.GPUSPerPod))
580+
gpuQuantity, err = apiResource.ParseQuantity(fmt.Sprintf("%d", nimService.GetMultiNodeGPUsPerPod()))
581581
if err != nil {
582582
logger.Error(err, "Failed to parse GPU per worker value")
583583
return nil, err

internal/controller/platform/kserve/nimservice_test.go

Lines changed: 0 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1744,23 +1744,6 @@ var _ = Describe("NIMServiceReconciler for a KServe platform", func() {
17441744
Expect(resources.Limits).To(HaveKeyWithValue(corev1.ResourceName("nvidia.com/gpu"), resource.MustParse("1")))
17451745
})
17461746

1747-
It("should assign GPU resource equal to multiNode.GPUSPerPod in multi-node deployment", func() {
1748-
nimService.Spec.MultiNode = &appsv1alpha1.NimServiceMultiNodeConfig{
1749-
GPUSPerPod: 2,
1750-
}
1751-
profile := &appsv1alpha1.NIMProfile{
1752-
Name: "test-profile",
1753-
Config: map[string]string{"tp": "4"},
1754-
}
1755-
1756-
resources, err := reconciler.addGPUResources(context.TODO(), nimService, profile)
1757-
Expect(err).ToNot(HaveOccurred())
1758-
Expect(resources).ToNot(BeNil())
1759-
1760-
Expect(resources.Requests).To(HaveKeyWithValue(corev1.ResourceName("nvidia.com/gpu"), resource.MustParse("2")))
1761-
Expect(resources.Limits).To(HaveKeyWithValue(corev1.ResourceName("nvidia.com/gpu"), resource.MustParse("2")))
1762-
})
1763-
17641747
It("should return an error if tensor parallelism cannot be parsed", func() {
17651748
profile := &appsv1alpha1.NIMProfile{
17661749
Name: "test-profile",

internal/controller/platform/standalone/nimservice.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1004,7 +1004,7 @@ func (r *NIMServiceReconciler) addGPUResources(ctx context.Context, nimService *
10041004
// if deployed as multi-node, use the GPU per worker value to assign GPU resources to each worker
10051005
// TODO: auto-determine based on tp*pp/(.spec.multiNode.size)
10061006
if nimService.Spec.MultiNode != nil {
1007-
gpuQuantity, err = apiResource.ParseQuantity(fmt.Sprintf("%d", nimService.Spec.MultiNode.GPUSPerPod))
1007+
gpuQuantity, err = apiResource.ParseQuantity(fmt.Sprintf("%d", nimService.GetMultiNodeGPUsPerPod()))
10081008
if err != nil {
10091009
logger.Error(err, "Failed to parse GPU per worker value")
10101010
return nil, err

0 commit comments

Comments
 (0)