Cleanup Multi Node API to avoid redundant GPU input#631
Cleanup Multi Node API to avoid redundant GPU input #631 — visheshtanksale wants to merge 19 commits into NVIDIA:main from
Conversation
71c6b62 to
72a326d
Compare
|
|
||
| // GetMultiNodeGPUsPerPod returns the number of GPUs per pod for the multi-node NIMService. | ||
| func (n *NIMService) GetMultiNodeGPUsPerPod() int { | ||
| gpuQuantity, ok := n.Spec.Resources.Requests["nvidia.com/gpu"] |
There was a problem hiding this comment.
do we need to support DRA here?
There was a problem hiding this comment.
Added DRA support. I am assuming, for ResourceClaim and ResourceClaimTemplate, that there might be only one request for GPUs under Spec.Devices.Requests. Is this a correct assumption?
internal/shared/resourceclaims.go
Outdated
| // Constants for GPU device detection. | ||
| const ( | ||
| // NVIDIA identifiers used to detect GPU devices. | ||
| NVIDIAIdentifier = "nvidia" |
There was a problem hiding this comment.
These might be used for non-GPU devices too, e.g. rdma.nvidia.com. We should just check for the well-known device class gpu.nvidia.com or, if custom ones are set, make sure the CEL expression exists in those classes:
expression: "device.driver == 'gpu.nvidia.com' && device.attributes['gpu.nvidia.com'].type == 'gpu'"
There was a problem hiding this comment.
Fixed this to look for gpu.nvidia.com
| switch { | ||
| case resource.ResourceClaimName != nil: | ||
| gpuCount, err = GetGPUDeviceCountForClaim(ctx, client, *resource.ResourceClaimName, namespace) | ||
| case resource.ResourceClaimTemplateName != nil: | ||
| gpuCount, err = GetGPUDeviceCountForClaimTemplate(ctx, client, *resource.ResourceClaimTemplateName, namespace) | ||
| case resource.ClaimCreationSpec != nil: | ||
| gpuCount, err = GetGPUDeviceCountForClaimCreationSpec(ctx, client, *resource.ClaimCreationSpec, namespace) | ||
| } |
There was a problem hiding this comment.
I am wondering: do we have a use case for users to specify two of (resourceClaimName, resourceClaimTemplateName, claimCreationSpec)...?
I thought users would only specify one of the three?
If that's the case, we should consider adding to the validation webhook
There was a problem hiding this comment.
@shengnuo this is possible across the spec.draResources[] array. 1 entry may have a resourceClaimName while another one may have a claimCreationSpec. We would need to aggregate them
There was a problem hiding this comment.
It's possible for users to specify all three resource specification types.
| // TODO auto determine base on tp*pp/(.spec.multiNode.size) | ||
| if nimService.Spec.MultiNode != nil { | ||
| gpuQuantity, err = apiResource.ParseQuantity(fmt.Sprintf("%d", nimService.Spec.MultiNode.GPUSPerPod)) | ||
| gpuQuantity, err = apiResource.ParseQuantity(fmt.Sprintf("%d", multiNodeGPUsPerPod)) |
There was a problem hiding this comment.
I'm confused why this change is needed. multiNodeGPUsPerPod is computed based on the count from the nvidia.com/gpu resource requests/limits. It doesn't make sense to do it here, since this only executes when resource requests/limits are missing.
There was a problem hiding this comment.
Removed this
internal/shared/multinode.go
Outdated
| // GetGPUCountPerPod returns the number of GPUs per pod for the NIMService. | ||
| func GetGPUCountPerPod(ctx context.Context, client client.Client, nimService *appsv1alpha1.NIMService) (int, error) { | ||
|
|
||
| if nimService.Spec.DRAResources == nil { |
There was a problem hiding this comment.
nit
| if nimService.Spec.DRAResources == nil { | |
| if len(nimService.Spec.DRAResources) == 0 { |
internal/shared/multinode.go
Outdated
|
|
||
| if nimService.Spec.DRAResources == nil { | ||
| if nimService.Spec.Resources == nil { | ||
| return 0, nil |
There was a problem hiding this comment.
we should return an error when no GPUs are specified and fail the NIMService for multiNode.
| gpuQuantity, ok := nimService.Spec.Resources.Requests["nvidia.com/gpu"] | ||
| if !ok { | ||
| // return 0 if no GPU limit is specified because auto determine base on tp*pp/(.spec.multiNode.size) is a TODO | ||
| return 0, nil | ||
| } |
There was a problem hiding this comment.
can you also check for resource limits here?
internal/shared/resourceclaims.go
Outdated
| } | ||
|
|
||
| hint := strings.ToLower(fmt.Sprint(dc.Spec)) | ||
| return strings.Contains(hint, NVIDIAIdentifier) || strings.Contains(hint, NVIDIAComIdentifier), nil |
There was a problem hiding this comment.
nit: we should restrict to gpu.nvidia.com device class.
| switch { | ||
| case resource.ResourceClaimName != nil: | ||
| gpuCount, err = GetGPUDeviceCountForClaim(ctx, client, *resource.ResourceClaimName, namespace) | ||
| case resource.ResourceClaimTemplateName != nil: | ||
| gpuCount, err = GetGPUDeviceCountForClaimTemplate(ctx, client, *resource.ResourceClaimTemplateName, namespace) | ||
| case resource.ClaimCreationSpec != nil: | ||
| gpuCount, err = GetGPUDeviceCountForClaimCreationSpec(ctx, client, *resource.ClaimCreationSpec, namespace) | ||
| } |
There was a problem hiding this comment.
@shengnuo this is possible across the spec.draResources[] array. 1 entry may have a resourceClaimName while another one may have a claimCreationSpec. We would need to aggregate them
| } | ||
|
|
||
| // Validate that Claims must be empty | ||
| if spec.Resources != nil && spec.Resources.Claims != nil && len(spec.Resources.Claims) > 0 { |
There was a problem hiding this comment.
nit: if it's nil, then len(...) will return 0
| if spec.Resources != nil && spec.Resources.Claims != nil && len(spec.Resources.Claims) > 0 { | |
| if spec.Resources != nil && len(spec.Resources.Claims) > 0 { |
| gpuResourceName := corev1.ResourceName("nvidia.com/gpu") | ||
|
|
||
| // Check if GPU requests are specified | ||
| if spec.Resources == nil || spec.Resources.Requests == nil { |
There was a problem hiding this comment.
nit: also check for resource limits
| errList = append(errList, validateMetricsConfiguration(&spec.Metrics, fldPath.Child("metrics"))...) | ||
| errList = append(errList, validateScaleConfiguration(&spec.Scale, fldPath.Child("scale"))...) | ||
| errList = append(errList, validateResourcesConfiguration(spec.Resources, fldPath.Child("resources"))...) | ||
| errList = append(errList, validateResourcesConfiguration(spec, fldPath.Child("resources"))...) |
There was a problem hiding this comment.
| errList = append(errList, validateResourcesConfiguration(spec, fldPath.Child("resources"))...) | |
| errList = append(errList, validateResourcesConfiguration(spec, fldPath)...) |
nit: its better to match the fieldPath with the input in the caller.
|
|
||
| logger.Info("Reconciling", "NIMService", nimService.Name) | ||
|
|
||
| if nimService.Spec.MultiNode != nil && nimService.Annotations != nil { |
There was a problem hiding this comment.
annotations could be nil initially, we would need to initialize the map for the first time.
There was a problem hiding this comment.
also, this should be done in the platform specific reconciler where we render the LWS resource. That way we can consider the case of auto GPU assignment too based on the tp size in the optimized profile, otherwise there would be mismatch and we end up setting this annotation to 0.
There was a problem hiding this comment.
Updated the annotation nil check.
Auto assign of GPU is currently not supported for Multi Node. We need to discuss this scenario.
| // Get tensorParallelism from the profile | ||
| tensorParallelism, err := utils.GetTensorParallelismByProfileTags(profile.Config) | ||
| if err != nil { | ||
| logger.Error(err, "Failed to retrieve tensorParallelism") |
There was a problem hiding this comment.
| logger.Error(err, "Failed to retrieve tensorParallelism") | |
| logger.Error(err, "Missing nvidia.com/gpu resource request and unable to retrieve tensorParallelism for NIM profile") |
internal/shared/resourceclaims.go
Outdated
|
|
||
| isGPU, err := isNVIDIAGPU(ctx, client, req.Exactly.DeviceClassName) | ||
| if err != nil { | ||
| // This allows partial success scenarios |
There was a problem hiding this comment.
we would need to fail in all cases here as GPU count might be incorrect, if there is an error getting the device-class for e.g.
| _, hasRequests := spec.Resources.Requests[gpuResourceName] | ||
| _, hasLimits := spec.Resources.Limits[gpuResourceName] | ||
|
|
||
| if !hasRequests && !hasLimits { |
There was a problem hiding this comment.
this will fail when an optimized profile is selected and we auto-retrieve GPUs based on the tp size.
There was a problem hiding this comment.
This check is only for the multi-node NIMService. Currently, we don't have support for a multi-node NIMService to assign GPU resources automatically here. Since I have removed .spec.multiNode.gpuPerWorker, the user has to provide GPU resource limits for multi-node.
There was a problem hiding this comment.
but we auto assign based on tp size for optimized profile as well with changes in the assignGPUResources call.
| // If no user-provided GPU resource is found, proceed with auto-assignment | ||
| // Get tensorParallelism from the profile | ||
| tensorParallelism, err := utils.GetTensorParallelismByProfileTags(profile.Config) | ||
| if err != nil { | ||
| logger.Error(err, "Missing nvidia.com/gpu resource request/limit and unable to retrieve tensorParallelism for NIM profile") | ||
| return nil, err | ||
| } | ||
| if tensorParallelism != "" { | ||
| gpuQuantity, err = apiResource.ParseQuantity(tensorParallelism) |
There was a problem hiding this comment.
the addGPUResources function looks exactly the same between the standalone and kserve controllers...
Consider adding it to internal/controller/shared
There was a problem hiding this comment.
this can be done as a follow up as this applies to many functions between those two packages.
| gpuResourceName := corev1.ResourceName("nvidia.com/gpu") | ||
|
|
||
| // Check if GPU requests or limits are specified | ||
| if spec.Resources == nil { |
There was a problem hiding this comment.
This should also consider spec.DRAResources
Signed-off-by: Vishesh Tanksale <vtanksale@nvidia.com>
Signed-off-by: Vishesh Tanksale <vtanksale@nvidia.com>
Signed-off-by: Vishesh Tanksale <vtanksale@nvidia.com>
Signed-off-by: Vishesh Tanksale <vtanksale@nvidia.com>
Signed-off-by: Vishesh Tanksale <vtanksale@nvidia.com>
Signed-off-by: Vishesh Tanksale <vtanksale@nvidia.com>
Signed-off-by: Vishesh Tanksale <vtanksale@nvidia.com>
Signed-off-by: Vishesh Tanksale <vtanksale@nvidia.com>
Signed-off-by: Vishesh Tanksale <vtanksale@nvidia.com>
Signed-off-by: Vishesh Tanksale <vtanksale@nvidia.com>
Signed-off-by: Vishesh Tanksale <vtanksale@nvidia.com>
Signed-off-by: Vishesh Tanksale <vtanksale@nvidia.com>
Signed-off-by: Vishesh Tanksale <vtanksale@nvidia.com>
Signed-off-by: Vishesh Tanksale <vtanksale@nvidia.com>
155695b to
1f53369
Compare
Signed-off-by: Vishesh Tanksale <vtanksale@nvidia.com>
| return errList | ||
| } | ||
|
|
||
| if spec.Resources != nil { |
There was a problem hiding this comment.
@visheshtanksale This could just include CPU/Memory requests, while spec.draResources can include GPU requests. First we need to check if draResources are specified then rely on spec.resources.
There was a problem hiding this comment.
This check is primarily on spec.resources, and we don't have any checks under spec.draResources because for that we would need to make an API request. Since this only includes checks under spec.resources, I feel it is better to keep the implementation here.
|
|
||
| // +kubebuilder:default:=1 | ||
| // Size specifies the number of pods to create for the multi-node NIMService. | ||
| // PipelineParallelism specifies the number of pods to create for the multi-node NIMService. |
There was a problem hiding this comment.
i am inclined to add a parallelism type for this if we need to add additional methods in the future.
parallelism:
pp: 1 (default/minimum)
Signed-off-by: Vishesh Tanksale <vtanksale@nvidia.com>
Signed-off-by: Vishesh Tanksale <vtanksale@nvidia.com>
Signed-off-by: Vishesh Tanksale <vtanksale@nvidia.com>
| if nimService.Annotations == nil { | ||
| nimService.Annotations = map[string]string{} | ||
| } | ||
| if _, ok := nimService.Annotations[utils.GPUCountPerPodAnnotationKey]; !ok { |
There was a problem hiding this comment.
nit: can you use GetGPUCountPerPod here? It's already reading that annotation value.
| // GPUSPerPod specifies the number of GPUs for each instance. In most cases, this should match `resources.limits.nvidia.com/gpu`. | ||
| GPUSPerPod int `json:"gpusPerPod,omitempty"` | ||
| // Parallelism specifies the parallelism strategy for the multi-node NIMService. | ||
| Parallelism Parallelism `json:"parallelism,omitempty"` |
There was a problem hiding this comment.
| Parallelism Parallelism `json:"parallelism,omitempty"` | |
| Parallelism *Parallelism `json:"parallelism"` |
I'd expect this to be a required field
| // +kubebuilder:default:=1 | ||
| // PP specifies the number of pods to create for the multi-node NIMService. | ||
| // +kubebuilder:validation:Minimum=1 | ||
| PP int `json:"pp,omitempty"` |
There was a problem hiding this comment.
| PP int `json:"pp,omitempty"` | |
| Pipeline *uint32 `json:"pp,omitempty"` |
can you call this Pipeline? PP doesn't sound good
| } | ||
|
|
||
| type Parallelism struct { | ||
| // +kubebuilder:default:=1 |
There was a problem hiding this comment.
we shouldn't have defaults here. This wouldn't work for multi-node anyway.
| } | ||
| } | ||
| return int(gpuQuantity.Value()), nil | ||
| } else { |
There was a problem hiding this comment.
style nit: you don't need an else block here.
| hint := strings.ToLower(fmt.Sprint(dc.Spec)) | ||
| return strings.Contains(hint, NVIDIAGPUComIdentifier), nil |
There was a problem hiding this comment.
this is kinda flaky because the term may appear anywhere in the spec. It's better to be safe by iterating over all the selectors and looking only at the expressions.
As a follow-up, we can also be careful to only check whether it contains device.driver == "gpu.nvidia.com" OR has device.attributes["gpu.nvidia.com"] OR has device.capacity["gpu.nvidia.com"].
|
|
||
| // Validate that Claims must be empty | ||
| if spec.Resources != nil && len(spec.Resources.Claims) > 0 { | ||
| errList = append(errList, field.Forbidden(fldPath.Child("claims"), "must be empty")) |
There was a problem hiding this comment.
| errList = append(errList, field.Forbidden(fldPath.Child("claims"), "must be empty")) | |
| errList = append(errList, field.Forbidden(fldPath.Child("resources").Child("claims"), "must be empty")) |
| } | ||
|
|
||
| // validateGPURequirements ensures that GPU resources are properly configured for MultiNode deployments. | ||
| func validateGPURequirements(spec *appsv1alpha1.NIMServiceSpec, fldPath *field.Path) field.ErrorList { |
There was a problem hiding this comment.
nit: call this validateMultiNodeGPURequirements?
| if hasRequests { | ||
| gpuRequests := spec.Resources.Requests[gpuResourceName] | ||
| if gpuRequests.IsZero() || gpuRequests.Value() <= 0 { | ||
| errList = append(errList, field.Invalid(fldPath.Child("requests").Child("nvidia.com/gpu"), gpuRequests.String(), "must be greater than 0")) |
There was a problem hiding this comment.
| errList = append(errList, field.Invalid(fldPath.Child("requests").Child("nvidia.com/gpu"), gpuRequests.String(), "must be greater than 0")) | |
| errList = append(errList, field.Invalid(fldPath.Child("requests").Key("nvidia.com/gpu"), gpuRequests.String(), "must be greater than 0")) |
| if hasLimits { | ||
| gpuLimits := spec.Resources.Limits[gpuResourceName] | ||
| if gpuLimits.IsZero() || gpuLimits.Value() <= 0 { | ||
| errList = append(errList, field.Invalid(fldPath.Child("limits").Child("nvidia.com/gpu"), gpuLimits.String(), "must be greater than 0")) |
There was a problem hiding this comment.
| errList = append(errList, field.Invalid(fldPath.Child("limits").Child("nvidia.com/gpu"), gpuLimits.String(), "must be greater than 0")) | |
| errList = append(errList, field.Invalid(fldPath.Child("limits").Key("nvidia.com/gpu"), gpuLimits.String(), "must be greater than 0")) |
|
can we close this as the API change was merged. We need a follow up on adding validations in the webhook. |
|
Closing this. Follow with webhook validations for next release |
Pull request was closed
spec.multiNode.GPUSPerPod · spec.Resources · spec.Resources · spec.Resources — then need to auto-determine based on tp*pp/(.spec.multiNode.size)