-
Notifications
You must be signed in to change notification settings - Fork 35
Auto-MNNVL: Update PCS's webhook support auto-mnnvl #370
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,28 @@ | ||
| // /* | ||
| // Copyright 2026 The Grove Authors. | ||
| // | ||
| // Licensed under the Apache License, Version 2.0 (the "License"); | ||
| // you may not use this file except in compliance with the License. | ||
| // You may obtain a copy of the License at | ||
| // | ||
| // http://www.apache.org/licenses/LICENSE-2.0 | ||
| // | ||
| // Unless required by applicable law or agreed to in writing, software | ||
| // distributed under the License is distributed on an "AS IS" BASIS, | ||
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| // See the License for the specific language governing permissions and | ||
| // limitations under the License. | ||
| // */ | ||
|
|
||
| package constants | ||
|
|
||
| import ( | ||
| corev1 "k8s.io/api/core/v1" | ||
| ) | ||
|
|
||
| // GPU resource constants | ||
| const ( | ||
| // GPUResourceName is the resource name for NVIDIA GPUs. It is used to determine | ||
| // if a pod requests NVIDIA GPUs. | ||
| GPUResourceName corev1.ResourceName = "nvidia.com/gpu" | ||
| ) |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,68 @@ | ||
| // /* | ||
| // Copyright 2025 The Grove Authors. | ||
| // | ||
| // Licensed under the Apache License, Version 2.0 (the "License"); | ||
| // you may not use this file except in compliance with the License. | ||
| // You may obtain a copy of the License at | ||
| // | ||
| // http://www.apache.org/licenses/LICENSE-2.0 | ||
| // | ||
| // Unless required by applicable law or agreed to in writing, software | ||
| // distributed under the License is distributed on an "AS IS" BASIS, | ||
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| // See the License for the specific language governing permissions and | ||
| // limitations under the License. | ||
| // */ | ||
|
|
||
| package mnnvl | ||
|
|
||
| import ( | ||
| grovecorev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1" | ||
| "github.com/ai-dynamo/grove/operator/internal/constants" | ||
|
|
||
| corev1 "k8s.io/api/core/v1" | ||
| ) | ||
|
|
||
| // hasGPURequirement checks if any container in any clique of the PCS requests nvidia.com/gpu. | ||
| func hasGPURequirement(pcs *grovecorev1alpha1.PodCliqueSet) bool { | ||
| for _, clique := range pcs.Spec.Template.Cliques { | ||
| if clique == nil { | ||
| continue | ||
| } | ||
| if hasGPUInContainers(clique.Spec.PodSpec.Containers) { | ||
| return true | ||
| } | ||
| if hasGPUInContainers(clique.Spec.PodSpec.InitContainers) { | ||
| return true | ||
| } | ||
| } | ||
| return false | ||
| } | ||
|
|
||
| // hasGPUInContainers checks if any container in the slice requests GPU resources. | ||
| func hasGPUInContainers(containers []corev1.Container) bool { | ||
| for _, container := range containers { | ||
| // Check limits | ||
| if quantity, exists := container.Resources.Limits[constants.GPUResourceName]; exists { | ||
| if !quantity.IsZero() { | ||
| return true | ||
| } | ||
| } | ||
| // Check requests | ||
| if quantity, exists := container.Resources.Requests[constants.GPUResourceName]; exists { | ||
| if !quantity.IsZero() { | ||
| return true | ||
| } | ||
| } | ||
| } | ||
| return false | ||
| } | ||
|
|
||
| // getAnnotationValue safely retrieves an annotation value from a PCS. | ||
| func getAnnotationValue(pcs *grovecorev1alpha1.PodCliqueSet, key string) (string, bool) { | ||
| if pcs.Annotations == nil { | ||
| return "", false | ||
| } | ||
| value, exists := pcs.Annotations[key] | ||
| return value, exists | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,154 @@ | ||
| // /* | ||
| // Copyright 2025 The Grove Authors. | ||
| // | ||
| // Licensed under the Apache License, Version 2.0 (the "License"); | ||
| // you may not use this file except in compliance with the License. | ||
| // You may obtain a copy of the License at | ||
| // | ||
| // http://www.apache.org/licenses/LICENSE-2.0 | ||
| // | ||
| // Unless required by applicable law or agreed to in writing, software | ||
| // distributed under the License is distributed on an "AS IS" BASIS, | ||
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| // See the License for the specific language governing permissions and | ||
| // limitations under the License. | ||
| // */ | ||
|
|
||
| package mnnvl | ||
|
|
||
| import ( | ||
| "testing" | ||
|
|
||
| grovecorev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1" | ||
| "github.com/ai-dynamo/grove/operator/internal/constants" | ||
| testutils "github.com/ai-dynamo/grove/operator/test/utils" | ||
|
|
||
| "github.com/stretchr/testify/assert" | ||
| corev1 "k8s.io/api/core/v1" | ||
| "k8s.io/apimachinery/pkg/api/resource" | ||
| ) | ||
|
|
||
| func Test_hasGPURequirement(t *testing.T) { | ||
| tests := []struct { | ||
| name string | ||
| pcs *grovecorev1alpha1.PodCliqueSet | ||
| expected bool | ||
| }{ | ||
| { | ||
| name: "container with GPU limits", | ||
| pcs: createPCSWithGPU(nil), | ||
| expected: true, | ||
| }, | ||
| { | ||
| name: "container without GPU", | ||
| pcs: createPCSWithoutGPU(nil), | ||
| expected: false, | ||
| }, | ||
| { | ||
| name: "empty cliques", | ||
| pcs: &grovecorev1alpha1.PodCliqueSet{}, | ||
| expected: false, | ||
| }, | ||
| { | ||
| name: "GPU in init container", | ||
| pcs: testutils.NewPodCliqueSetBuilder("test-pcs", "default", ""). | ||
| WithPodCliqueTemplateSpec( | ||
| testutils.NewPodCliqueTemplateSpecBuilder("worker"). | ||
| WithInitContainer(corev1.Container{ | ||
| Name: "init", | ||
| Resources: corev1.ResourceRequirements{ | ||
| Limits: corev1.ResourceList{ | ||
| constants.GPUResourceName: resource.MustParse("1"), | ||
| }, | ||
| }, | ||
| }). | ||
| Build(), | ||
| ). | ||
| Build(), | ||
| expected: true, | ||
| }, | ||
| { | ||
| name: "GPU in requests not limits", | ||
| pcs: testutils.NewPodCliqueSetBuilder("test-pcs", "default", ""). | ||
| WithPodCliqueTemplateSpec( | ||
| testutils.NewPodCliqueTemplateSpecBuilder("worker"). | ||
| WithContainer(corev1.Container{ | ||
| Name: "train", | ||
| Resources: corev1.ResourceRequirements{ | ||
| Requests: corev1.ResourceList{ | ||
| constants.GPUResourceName: resource.MustParse("2"), | ||
| }, | ||
| }, | ||
| }). | ||
| Build(), | ||
| ). | ||
| Build(), | ||
| expected: true, | ||
| }, | ||
| { | ||
| name: "GPU with zero quantity", | ||
| pcs: testutils.NewPodCliqueSetBuilder("test-pcs", "default", ""). | ||
| WithPodCliqueTemplateSpec( | ||
| testutils.NewPodCliqueTemplateSpecBuilder("worker"). | ||
| WithContainer(corev1.Container{ | ||
| Name: "train", | ||
| Resources: corev1.ResourceRequirements{ | ||
| Limits: corev1.ResourceList{ | ||
| constants.GPUResourceName: resource.MustParse("0"), | ||
| }, | ||
| }, | ||
| }). | ||
| Build(), | ||
| ). | ||
| Build(), | ||
| expected: false, | ||
| }, | ||
| { | ||
| name: "multiple cliques - one with GPU", | ||
| pcs: testutils.NewPodCliqueSetBuilder("test-pcs", "default", ""). | ||
| WithPodCliqueTemplateSpec( | ||
| testutils.NewPodCliqueTemplateSpecBuilder("controller"). | ||
| WithContainer(testutils.NewContainer("ctrl", "busybox")). | ||
| Build(), | ||
| ). | ||
| WithPodCliqueTemplateSpec( | ||
| testutils.NewPodCliqueTemplateSpecBuilder("worker"). | ||
| WithContainer(testutils.NewGPUContainer("train", "nvidia/cuda:latest", 8)). | ||
| Build(), | ||
| ). | ||
| Build(), | ||
| expected: true, | ||
| }, | ||
| } | ||
|
|
||
| for _, test := range tests { | ||
| t.Run(test.name, func(t *testing.T) { | ||
| result := hasGPURequirement(test.pcs) | ||
| assert.Equal(t, test.expected, result) | ||
| }) | ||
| } | ||
| } | ||
|
|
||
| // createPCSWithGPU creates a PCS with GPU using the builder for tests in this package. | ||
| func createPCSWithGPU(annotations map[string]string) *grovecorev1alpha1.PodCliqueSet { | ||
| return testutils.NewPodCliqueSetBuilder("test-pcs", "default", ""). | ||
| WithAnnotations(annotations). | ||
| WithPodCliqueTemplateSpec( | ||
| testutils.NewPodCliqueTemplateSpecBuilder("worker"). | ||
| WithContainer(testutils.NewGPUContainer("train", "nvidia/cuda:latest", 8)). | ||
| Build(), | ||
| ). | ||
| Build() | ||
| } | ||
|
|
||
| // createPCSWithoutGPU creates a PCS without GPU using the builder for tests in this package. | ||
| func createPCSWithoutGPU(annotations map[string]string) *grovecorev1alpha1.PodCliqueSet { | ||
| return testutils.NewPodCliqueSetBuilder("test-pcs", "default", ""). | ||
| WithAnnotations(annotations). | ||
| WithPodCliqueTemplateSpec( | ||
| testutils.NewPodCliqueTemplateSpecBuilder("worker"). | ||
| WithContainer(testutils.NewContainer("app", "nginx:latest")). | ||
| Build(), | ||
| ). | ||
| Build() | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,99 @@ | ||
| // /* | ||
| // Copyright 2026 The Grove Authors. | ||
| // | ||
| // Licensed under the Apache License, Version 2.0 (the "License"); | ||
| // you may not use this file except in compliance with the License. | ||
| // You may obtain a copy of the License at | ||
| // | ||
| // http://www.apache.org/licenses/LICENSE-2.0 | ||
| // | ||
| // Unless required by applicable law or agreed to in writing, software | ||
| // distributed under the License is distributed on an "AS IS" BASIS, | ||
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| // See the License for the specific language governing permissions and | ||
| // limitations under the License. | ||
| // */ | ||
|
|
||
| package mnnvl | ||
|
|
||
| import ( | ||
| "fmt" | ||
|
|
||
| grovecorev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1" | ||
| ) | ||
|
|
||
| // MutateAutoMNNVL adds the grove.io/auto-mnnvl annotation to a PodCliqueSet | ||
| // if all conditions are met: | ||
| // 1. Annotation does not already exist | ||
| // 2. MNNVL feature is enabled globally (autoMNNVLEnabled) | ||
| // 3. PCS has at least one container requesting GPU | ||
| // | ||
| // Returns true if the annotation was added, false otherwise. | ||
| func MutateAutoMNNVL(pcs *grovecorev1alpha1.PodCliqueSet, autoMNNVLEnabled bool) bool { | ||
| // If feature is disabled, don't add annotation | ||
| if !autoMNNVLEnabled { | ||
| return false | ||
| } | ||
|
|
||
| // If annotation already exists (user explicitly set it), don't override | ||
| if pcs.Annotations != nil { | ||
| if _, exists := pcs.Annotations[AnnotationAutoMNNVL]; exists { | ||
| return false | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. add a debug log here |
||
| } | ||
| } | ||
|
|
||
| // Check if PCS has GPU requirements | ||
| if !hasGPURequirement(pcs) { | ||
| return false | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. add a debug log here (I don't expect it to happen frequently) |
||
| } | ||
|
|
||
| // All conditions met - add the annotation | ||
| if pcs.Annotations == nil { | ||
| pcs.Annotations = make(map[string]string) | ||
| } | ||
| pcs.Annotations[AnnotationAutoMNNVL] = "true" | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. repeating a comment from previous PR - just in case - are we using the values of true/false or perhaps something like enabled/required/auto/etc.? |
||
| return true | ||
| } | ||
|
|
||
| // ValidateAutoMNNVLOnCreate validates the MNNVL annotation on PCS creation. | ||
| // Returns an error if the annotation is set to "true" but the MNNVL feature is disabled. | ||
| // This prevents users from explicitly requesting MNNVL when the cluster doesn't support it. | ||
| func ValidateAutoMNNVLOnCreate(pcs *grovecorev1alpha1.PodCliqueSet, autoMNNVLEnabled bool) error { | ||
| value, exists := pcs.Annotations[AnnotationAutoMNNVL] | ||
| if !exists { | ||
| return nil | ||
| } | ||
|
|
||
| // If annotation is "true" but feature is disabled, reject | ||
| if value == "true" && !autoMNNVLEnabled { | ||
| return fmt.Errorf("MNNVL is not enabled in the operator configuration. "+ | ||
| "Either enable MNNVL globally or remove the %s annotation", AnnotationAutoMNNVL) | ||
| } | ||
|
|
||
| return nil | ||
| } | ||
|
|
||
| // ValidateAutoMNNVLOnUpdate ensures the grove.io/auto-mnnvl annotation is immutable. | ||
| // Returns an error if the annotation was added, removed, or its value was changed. | ||
| func ValidateAutoMNNVLOnUpdate(oldPCS, newPCS *grovecorev1alpha1.PodCliqueSet) error { | ||
| oldValue, oldExists := getAnnotationValue(oldPCS, AnnotationAutoMNNVL) | ||
| newValue, newExists := getAnnotationValue(newPCS, AnnotationAutoMNNVL) | ||
|
|
||
| // Check if annotation was added | ||
| if !oldExists && newExists { | ||
| return fmt.Errorf("annotation %s cannot be added after PodCliqueSet creation", AnnotationAutoMNNVL) | ||
| } | ||
|
|
||
| // Check if annotation was removed | ||
| if oldExists && !newExists { | ||
| return fmt.Errorf("annotation %s cannot be removed after PodCliqueSet creation", AnnotationAutoMNNVL) | ||
| } | ||
|
|
||
| // Check if annotation value was changed | ||
| if newExists && oldValue != newValue { | ||
| return fmt.Errorf("annotation %s is immutable and cannot be changed from %q to %q", | ||
| AnnotationAutoMNNVL, oldValue, newValue) | ||
| } | ||
|
|
||
| return nil | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
we should raise the question about DRA-provided gpu requests, where the claim name is provided but the actual claim details are separated to a ResourceClaimTemplate resource.