@@ -145,10 +145,6 @@ type NimServiceMultiNodeConfig struct {
145145 // +kubebuilder:validation:Minimum=1
146146 Size int `json:"size,omitempty"`
147147
148- // +kubebuilder:default:=1
149- // GPUSPerPod specifies the number of GPUs for each instance. In most cases, this should match `resources.limits.nvidia.com/gpu`.
150- GPUSPerPod int `json:"gpusPerPod,omitempty"`
151-
152148 // MPI config for NIMService using LeaderWorkerSet
153149 MPI * MultiNodeMPIConfig `json:"mpi,omitempty"`
154150}
@@ -227,6 +223,16 @@ func (n *NIMService) GetLWSName() string {
227223 return fmt .Sprintf ("%s-lws" , n .GetName ())
228224}
229225
226+ // GetMultiNodeGPUsPerPod returns the number of GPUs per pod for the multi-node NIMService.
227+ func (n * NIMService ) GetMultiNodeGPUsPerPod () int {
228+ gpuQuantity , ok := n .Spec .Resources .Requests ["nvidia.com/gpu" ]
229+ if ! ok {
230+ // return 0 if no GPU limit is specified because auto determine base on tp*pp/(.spec.multiNode.size) is a TODO
231+ return 0
232+ }
233+ return int (gpuQuantity .Value ())
234+ }
235+
230236// GetPVCName returns the name to be used for the PVC based on the custom spec
231237// Prefers pvc.Name if explicitly set by the user in the NIMService instance.
232238func (n * NIMService ) GetPVCName (pvc PersistentVolumeClaim ) string {
@@ -336,7 +342,7 @@ func (n *NIMService) getLWSCommonEnv() []corev1.EnvVar {
336342 },
337343 {
338344 Name : "NIM_TENSOR_PARALLEL_SIZE" ,
339- Value : fmt .Sprintf ("%d" , n .Spec . MultiNode . GPUSPerPod ),
345+ Value : fmt .Sprintf ("%d" , n .GetMultiNodeGPUsPerPod () ),
340346 },
341347 {
342348 Name : "NIM_PIPELINE_PARALLEL_SIZE" ,
@@ -377,7 +383,7 @@ func (n *NIMService) GetLWSLeaderEnv() []corev1.EnvVar {
377383 },
378384 {
379385 Name : "GPUS_PER_NODE" ,
380- Value : fmt .Sprintf ("%d" , n .Spec . MultiNode . GPUSPerPod ),
386+ Value : fmt .Sprintf ("%d" , n .GetMultiNodeGPUsPerPod () ),
381387 },
382388 {
383389 Name : "CLUSTER_START_TIMEOUT" ,
@@ -1198,10 +1204,10 @@ func (n *NIMService) generateMPIConfigData() map[string]string {
11981204 // Construct ConfigMap data
11991205 data := make (map [string ]string )
12001206 for i := 0 ; i < n .Spec .Replicas ; i ++ {
1201- hostfile := fmt .Sprintf ("localhost slots=%d\n " , n .Spec . MultiNode . GPUSPerPod )
1207+ hostfile := fmt .Sprintf ("localhost slots=%d\n " , n .GetMultiNodeGPUsPerPod () )
12021208 for j := 1 ; j < n .Spec .MultiNode .Size ; j ++ {
12031209 workerHostname := fmt .Sprintf ("%s-%d-%d.%s.%s.svc slots=%d" ,
1204- n .GetLWSName (), i , j , n .GetLWSName (), n .GetNamespace (), n .Spec . MultiNode . GPUSPerPod )
1210+ n .GetLWSName (), i , j , n .GetLWSName (), n .GetNamespace (), n .GetMultiNodeGPUsPerPod () )
12051211 hostfile += workerHostname + "\n "
12061212 }
12071213 dataKey := fmt .Sprintf ("hostfile-%d" , i )
0 commit comments