Skip to content

Commit 2b70208

Browse files
committed
Auto-MNNVL: Update PCS's webhook support auto-mnnvl
Mutating webhook (on create): - Automatically adds grove.io/auto-mnnvl: "true" annotation when MNNVL is globally enabled and PCS requests GPUs - Respects existing annotation values (allows opt-out with "false") Validating webhook (on create): - Rejects PCS with auto-mnnvl: "true" if MNNVL feature is disabled Validating webhook (on update): - Enforces annotation immutability after creation General: - Add flexible WithContainer/WithInitContainer builder methods - Add NewGPUContainer/NewContainer helper functions - Add WithAnnotations to PodCliqueSetBuilder - Remove deprecated test-envtest from the make test
1 parent 16bf3c9 commit 2b70208

File tree

20 files changed

+1028
-35
lines changed

20 files changed

+1028
-35
lines changed

Makefile

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -100,21 +100,15 @@ cover-html:
100100
@echo "> Generating HTML coverage report for operator"
101101
@make --directory=operator cover-html
102102

103-
# Runs envtest tests for the operator
104-
.PHONY: test-envtest
105-
test-envtest:
106-
@echo "> Running envtest for operator"
107-
@make --directory=operator test-envtest
108-
109103
# Runs e2e tests for the operator
110104
.PHONY: test-e2e
111105
test-e2e:
112106
@echo "> Running e2e tests for operator"
113107
@make --directory=operator test-e2e
114108

115-
# Runs all tests (unit + envtest)
109+
# Runs all tests
116110
.PHONY: test
117-
test: test-unit test-envtest
111+
test: test-unit
118112
@echo "> All tests passed"
119113

120114
# Updates the docs/proposals table of contents

operator/internal/constants/gpu.go

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
// /*
2+
// Copyright 2026 The Grove Authors.
3+
//
4+
// Licensed under the Apache License, Version 2.0 (the "License");
5+
// you may not use this file except in compliance with the License.
6+
// You may obtain a copy of the License at
7+
//
8+
// http://www.apache.org/licenses/LICENSE-2.0
9+
//
10+
// Unless required by applicable law or agreed to in writing, software
11+
// distributed under the License is distributed on an "AS IS" BASIS,
12+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
// See the License for the specific language governing permissions and
14+
// limitations under the License.
15+
// */
16+
17+
package constants
18+
19+
import (
20+
corev1 "k8s.io/api/core/v1"
21+
)
22+
23+
// GPU resource constants
24+
const (
25+
// GPUResourceName is the resource name for NVIDIA GPUs. It is used to determine
26+
// if a pod requests NVIDIA GPUs.
27+
GPUResourceName corev1.ResourceName = "nvidia.com/gpu"
28+
)

operator/internal/controller/manager.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ func RegisterControllersAndWebhooks(mgr ctrl.Manager, logger logr.Logger, operat
6262
if err := registerControllersWithMgr(mgr, operatorCfg.Controllers, operatorCfg.TopologyAwareScheduling); err != nil {
6363
return err
6464
}
65-
if err := registerWebhooksWithMgr(mgr, operatorCfg.Authorizer, operatorCfg.TopologyAwareScheduling); err != nil {
65+
if err := registerWebhooksWithMgr(mgr, operatorCfg.Authorizer, operatorCfg.TopologyAwareScheduling, operatorCfg.Network); err != nil {
6666
return err
6767
}
6868
return nil

operator/internal/controller/manager_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -556,7 +556,7 @@ func TestRegisterControllersAndWebhooks(t *testing.T) {
556556
controllersCalled = true
557557
return tc.controllerErr
558558
}
559-
registerWebhooksWithMgr = func(_ ctrl.Manager, _ configv1alpha1.AuthorizerConfig, _ configv1alpha1.TopologyAwareSchedulingConfiguration) error {
559+
registerWebhooksWithMgr = func(_ ctrl.Manager, _ configv1alpha1.AuthorizerConfig, _ configv1alpha1.TopologyAwareSchedulingConfiguration, _ configv1alpha1.NetworkAcceleration) error {
560560
webhooksCalled = true
561561
return tc.webhookErr
562562
}

operator/internal/mnnvl/helpers.go

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
// /*
2+
// Copyright 2025 The Grove Authors.
3+
//
4+
// Licensed under the Apache License, Version 2.0 (the "License");
5+
// you may not use this file except in compliance with the License.
6+
// You may obtain a copy of the License at
7+
//
8+
// http://www.apache.org/licenses/LICENSE-2.0
9+
//
10+
// Unless required by applicable law or agreed to in writing, software
11+
// distributed under the License is distributed on an "AS IS" BASIS,
12+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
// See the License for the specific language governing permissions and
14+
// limitations under the License.
15+
// */
16+
17+
package mnnvl
18+
19+
import (
20+
grovecorev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1"
21+
"github.com/ai-dynamo/grove/operator/internal/constants"
22+
23+
corev1 "k8s.io/api/core/v1"
24+
)
25+
26+
// hasGPURequirement checks if any container in any clique of the PCS requests nvidia.com/gpu.
27+
func hasGPURequirement(pcs *grovecorev1alpha1.PodCliqueSet) bool {
28+
for _, clique := range pcs.Spec.Template.Cliques {
29+
if clique == nil {
30+
continue
31+
}
32+
if hasGPUInContainers(clique.Spec.PodSpec.Containers) {
33+
return true
34+
}
35+
if hasGPUInContainers(clique.Spec.PodSpec.InitContainers) {
36+
return true
37+
}
38+
}
39+
return false
40+
}
41+
42+
// hasGPUInContainers checks if any container in the slice requests GPU resources.
43+
func hasGPUInContainers(containers []corev1.Container) bool {
44+
for _, container := range containers {
45+
// Check limits
46+
if quantity, exists := container.Resources.Limits[constants.GPUResourceName]; exists {
47+
if !quantity.IsZero() {
48+
return true
49+
}
50+
}
51+
// Check requests
52+
if quantity, exists := container.Resources.Requests[constants.GPUResourceName]; exists {
53+
if !quantity.IsZero() {
54+
return true
55+
}
56+
}
57+
}
58+
return false
59+
}
60+
61+
// getAnnotationValue safely retrieves an annotation value from a PCS.
62+
func getAnnotationValue(pcs *grovecorev1alpha1.PodCliqueSet, key string) (string, bool) {
63+
if pcs.Annotations == nil {
64+
return "", false
65+
}
66+
value, exists := pcs.Annotations[key]
67+
return value, exists
68+
}
Lines changed: 154 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,154 @@
1+
// /*
2+
// Copyright 2025 The Grove Authors.
3+
//
4+
// Licensed under the Apache License, Version 2.0 (the "License");
5+
// you may not use this file except in compliance with the License.
6+
// You may obtain a copy of the License at
7+
//
8+
// http://www.apache.org/licenses/LICENSE-2.0
9+
//
10+
// Unless required by applicable law or agreed to in writing, software
11+
// distributed under the License is distributed on an "AS IS" BASIS,
12+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
// See the License for the specific language governing permissions and
14+
// limitations under the License.
15+
// */
16+
17+
package mnnvl
18+
19+
import (
20+
"testing"
21+
22+
grovecorev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1"
23+
"github.com/ai-dynamo/grove/operator/internal/constants"
24+
testutils "github.com/ai-dynamo/grove/operator/test/utils"
25+
26+
"github.com/stretchr/testify/assert"
27+
corev1 "k8s.io/api/core/v1"
28+
"k8s.io/apimachinery/pkg/api/resource"
29+
)
30+
31+
func Test_hasGPURequirement(t *testing.T) {
32+
tests := []struct {
33+
name string
34+
pcs *grovecorev1alpha1.PodCliqueSet
35+
expected bool
36+
}{
37+
{
38+
name: "container with GPU limits",
39+
pcs: createPCSWithGPU(nil),
40+
expected: true,
41+
},
42+
{
43+
name: "container without GPU",
44+
pcs: createPCSWithoutGPU(nil),
45+
expected: false,
46+
},
47+
{
48+
name: "empty cliques",
49+
pcs: &grovecorev1alpha1.PodCliqueSet{},
50+
expected: false,
51+
},
52+
{
53+
name: "GPU in init container",
54+
pcs: testutils.NewPodCliqueSetBuilder("test-pcs", "default", "").
55+
WithPodCliqueTemplateSpec(
56+
testutils.NewPodCliqueTemplateSpecBuilder("worker").
57+
WithInitContainer(corev1.Container{
58+
Name: "init",
59+
Resources: corev1.ResourceRequirements{
60+
Limits: corev1.ResourceList{
61+
constants.GPUResourceName: resource.MustParse("1"),
62+
},
63+
},
64+
}).
65+
Build(),
66+
).
67+
Build(),
68+
expected: true,
69+
},
70+
{
71+
name: "GPU in requests not limits",
72+
pcs: testutils.NewPodCliqueSetBuilder("test-pcs", "default", "").
73+
WithPodCliqueTemplateSpec(
74+
testutils.NewPodCliqueTemplateSpecBuilder("worker").
75+
WithContainer(corev1.Container{
76+
Name: "train",
77+
Resources: corev1.ResourceRequirements{
78+
Requests: corev1.ResourceList{
79+
constants.GPUResourceName: resource.MustParse("2"),
80+
},
81+
},
82+
}).
83+
Build(),
84+
).
85+
Build(),
86+
expected: true,
87+
},
88+
{
89+
name: "GPU with zero quantity",
90+
pcs: testutils.NewPodCliqueSetBuilder("test-pcs", "default", "").
91+
WithPodCliqueTemplateSpec(
92+
testutils.NewPodCliqueTemplateSpecBuilder("worker").
93+
WithContainer(corev1.Container{
94+
Name: "train",
95+
Resources: corev1.ResourceRequirements{
96+
Limits: corev1.ResourceList{
97+
constants.GPUResourceName: resource.MustParse("0"),
98+
},
99+
},
100+
}).
101+
Build(),
102+
).
103+
Build(),
104+
expected: false,
105+
},
106+
{
107+
name: "multiple cliques - one with GPU",
108+
pcs: testutils.NewPodCliqueSetBuilder("test-pcs", "default", "").
109+
WithPodCliqueTemplateSpec(
110+
testutils.NewPodCliqueTemplateSpecBuilder("controller").
111+
WithContainer(testutils.NewContainer("ctrl", "busybox")).
112+
Build(),
113+
).
114+
WithPodCliqueTemplateSpec(
115+
testutils.NewPodCliqueTemplateSpecBuilder("worker").
116+
WithContainer(testutils.NewGPUContainer("train", "nvidia/cuda:latest", 8)).
117+
Build(),
118+
).
119+
Build(),
120+
expected: true,
121+
},
122+
}
123+
124+
for _, test := range tests {
125+
t.Run(test.name, func(t *testing.T) {
126+
result := hasGPURequirement(test.pcs)
127+
assert.Equal(t, test.expected, result)
128+
})
129+
}
130+
}
131+
132+
// createPCSWithGPU creates a PCS with GPU using the builder for tests in this package.
133+
func createPCSWithGPU(annotations map[string]string) *grovecorev1alpha1.PodCliqueSet {
134+
return testutils.NewPodCliqueSetBuilder("test-pcs", "default", "").
135+
WithAnnotations(annotations).
136+
WithPodCliqueTemplateSpec(
137+
testutils.NewPodCliqueTemplateSpecBuilder("worker").
138+
WithContainer(testutils.NewGPUContainer("train", "nvidia/cuda:latest", 8)).
139+
Build(),
140+
).
141+
Build()
142+
}
143+
144+
// createPCSWithoutGPU creates a PCS without GPU using the builder for tests in this package.
145+
func createPCSWithoutGPU(annotations map[string]string) *grovecorev1alpha1.PodCliqueSet {
146+
return testutils.NewPodCliqueSetBuilder("test-pcs", "default", "").
147+
WithAnnotations(annotations).
148+
WithPodCliqueTemplateSpec(
149+
testutils.NewPodCliqueTemplateSpecBuilder("worker").
150+
WithContainer(testutils.NewContainer("app", "nginx:latest")).
151+
Build(),
152+
).
153+
Build()
154+
}

operator/internal/mnnvl/webhook.go

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
// /*
2+
// Copyright 2026 The Grove Authors.
3+
//
4+
// Licensed under the Apache License, Version 2.0 (the "License");
5+
// you may not use this file except in compliance with the License.
6+
// You may obtain a copy of the License at
7+
//
8+
// http://www.apache.org/licenses/LICENSE-2.0
9+
//
10+
// Unless required by applicable law or agreed to in writing, software
11+
// distributed under the License is distributed on an "AS IS" BASIS,
12+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
// See the License for the specific language governing permissions and
14+
// limitations under the License.
15+
// */
16+
17+
package mnnvl
18+
19+
import (
20+
"fmt"
21+
22+
grovecorev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1"
23+
)
24+
25+
// MutateAutoMNNVL adds the grove.io/auto-mnnvl annotation to a PodCliqueSet
26+
// if all conditions are met:
27+
// 1. Annotation does not already exist
28+
// 2. MNNVL feature is enabled globally (autoMNNVLEnabled)
29+
// 3. PCS has at least one container requesting GPU
30+
//
31+
// Returns true if the annotation was added, false otherwise.
32+
func MutateAutoMNNVL(pcs *grovecorev1alpha1.PodCliqueSet, autoMNNVLEnabled bool) bool {
33+
// If feature is disabled, don't add annotation
34+
if !autoMNNVLEnabled {
35+
return false
36+
}
37+
38+
// If annotation already exists (user explicitly set it), don't override
39+
if pcs.Annotations != nil {
40+
if _, exists := pcs.Annotations[AnnotationAutoMNNVL]; exists {
41+
return false
42+
}
43+
}
44+
45+
// Check if PCS has GPU requirements
46+
if !hasGPURequirement(pcs) {
47+
return false
48+
}
49+
50+
// All conditions met - add the annotation
51+
if pcs.Annotations == nil {
52+
pcs.Annotations = make(map[string]string)
53+
}
54+
pcs.Annotations[AnnotationAutoMNNVL] = "true"
55+
return true
56+
}
57+
58+
// ValidateAutoMNNVLOnCreate validates the MNNVL annotation on PCS creation.
59+
// Returns an error if the annotation is set to "true" but the MNNVL feature is disabled.
60+
// This prevents users from explicitly requesting MNNVL when the cluster doesn't support it.
61+
func ValidateAutoMNNVLOnCreate(pcs *grovecorev1alpha1.PodCliqueSet, autoMNNVLEnabled bool) error {
62+
value, exists := pcs.Annotations[AnnotationAutoMNNVL]
63+
if !exists {
64+
return nil
65+
}
66+
67+
// If annotation is "true" but feature is disabled, reject
68+
if value == "true" && !autoMNNVLEnabled {
69+
return fmt.Errorf("MNNVL is not enabled in the operator configuration. "+
70+
"Either enable MNNVL globally or remove the %s annotation", AnnotationAutoMNNVL)
71+
}
72+
73+
return nil
74+
}
75+
76+
// ValidateAutoMNNVLOnUpdate ensures the grove.io/auto-mnnvl annotation is immutable.
77+
// Returns an error if the annotation was added, removed, or its value was changed.
78+
func ValidateAutoMNNVLOnUpdate(oldPCS, newPCS *grovecorev1alpha1.PodCliqueSet) error {
79+
oldValue, oldExists := getAnnotationValue(oldPCS, AnnotationAutoMNNVL)
80+
newValue, newExists := getAnnotationValue(newPCS, AnnotationAutoMNNVL)
81+
82+
// Check if annotation was added
83+
if !oldExists && newExists {
84+
return fmt.Errorf("annotation %s cannot be added after PodCliqueSet creation", AnnotationAutoMNNVL)
85+
}
86+
87+
// Check if annotation was removed
88+
if oldExists && !newExists {
89+
return fmt.Errorf("annotation %s cannot be removed after PodCliqueSet creation", AnnotationAutoMNNVL)
90+
}
91+
92+
// Check if annotation value was changed
93+
if newExists && oldValue != newValue {
94+
return fmt.Errorf("annotation %s is immutable and cannot be changed from %q to %q",
95+
AnnotationAutoMNNVL, oldValue, newValue)
96+
}
97+
98+
return nil
99+
}

0 commit comments

Comments
 (0)