Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 2 additions & 8 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -100,21 +100,15 @@ cover-html:
@echo "> Generating HTML coverage report for operator"
@make --directory=operator cover-html

# Runs envtest tests for the operator
.PHONY: test-envtest
test-envtest:
@echo "> Running envtest for operator"
@make --directory=operator test-envtest

# Runs e2e tests for the operator
.PHONY: test-e2e
test-e2e:
@echo "> Running e2e tests for operator"
@make --directory=operator test-e2e

# Runs all tests (unit + envtest)
# Runs all tests
.PHONY: test
test: test-unit test-envtest
test: test-unit
@echo "> All tests passed"

# Updates the docs/proposals table of contents
Expand Down
28 changes: 28 additions & 0 deletions operator/internal/constants/gpu.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
// /*
// Copyright 2026 The Grove Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// */

package constants

import (
corev1 "k8s.io/api/core/v1"
)

// GPU resource constants
const (
// GPUResourceName is the resource name for NVIDIA GPUs. It is used to determine
// if a pod requests NVIDIA GPUs.
GPUResourceName corev1.ResourceName = "nvidia.com/gpu"
)
2 changes: 1 addition & 1 deletion operator/internal/controller/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ func RegisterControllersAndWebhooks(mgr ctrl.Manager, logger logr.Logger, operat
if err := registerControllersWithMgr(mgr, operatorCfg.Controllers, operatorCfg.TopologyAwareScheduling); err != nil {
return err
}
if err := registerWebhooksWithMgr(mgr, operatorCfg.Authorizer, operatorCfg.TopologyAwareScheduling); err != nil {
if err := registerWebhooksWithMgr(mgr, operatorCfg.Authorizer, operatorCfg.TopologyAwareScheduling, operatorCfg.Network); err != nil {
return err
}
return nil
Expand Down
2 changes: 1 addition & 1 deletion operator/internal/controller/manager_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -556,7 +556,7 @@ func TestRegisterControllersAndWebhooks(t *testing.T) {
controllersCalled = true
return tc.controllerErr
}
registerWebhooksWithMgr = func(_ ctrl.Manager, _ configv1alpha1.AuthorizerConfig, _ configv1alpha1.TopologyAwareSchedulingConfiguration) error {
registerWebhooksWithMgr = func(_ ctrl.Manager, _ configv1alpha1.AuthorizerConfig, _ configv1alpha1.TopologyAwareSchedulingConfiguration, _ configv1alpha1.NetworkAcceleration) error {
webhooksCalled = true
return tc.webhookErr
}
Expand Down
68 changes: 68 additions & 0 deletions operator/internal/mnnvl/helpers.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
// /*
// Copyright 2025 The Grove Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// */

package mnnvl

import (
grovecorev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1"
"github.com/ai-dynamo/grove/operator/internal/constants"

corev1 "k8s.io/api/core/v1"
)

// hasGPURequirement checks if any container in any clique of the PCS requests nvidia.com/gpu.
func hasGPURequirement(pcs *grovecorev1alpha1.PodCliqueSet) bool {

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we should raise the question about DRA-provided gpu requests, where the claim name is provided but the actual claim details are separated to a ResourceClaimTemplate resource.

for _, clique := range pcs.Spec.Template.Cliques {
if clique == nil {
continue
}
if hasGPUInContainers(clique.Spec.PodSpec.Containers) {
return true
}
if hasGPUInContainers(clique.Spec.PodSpec.InitContainers) {
return true
}
}
return false
}

// hasGPUInContainers checks if any container in the slice requests GPU resources.
func hasGPUInContainers(containers []corev1.Container) bool {
for _, container := range containers {
// Check limits
if quantity, exists := container.Resources.Limits[constants.GPUResourceName]; exists {
if !quantity.IsZero() {
return true
}
}
// Check requests
if quantity, exists := container.Resources.Requests[constants.GPUResourceName]; exists {
if !quantity.IsZero() {
return true
}
}
}
return false
}

// getAnnotationValue safely retrieves an annotation value from a PCS.
func getAnnotationValue(pcs *grovecorev1alpha1.PodCliqueSet, key string) (string, bool) {
if pcs.Annotations == nil {
return "", false
}
value, exists := pcs.Annotations[key]
return value, exists
}
154 changes: 154 additions & 0 deletions operator/internal/mnnvl/helpers_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
// /*
// Copyright 2025 The Grove Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// */

package mnnvl

import (
"testing"

grovecorev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1"
"github.com/ai-dynamo/grove/operator/internal/constants"
testutils "github.com/ai-dynamo/grove/operator/test/utils"

"github.com/stretchr/testify/assert"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
)

func Test_hasGPURequirement(t *testing.T) {
tests := []struct {
name string
pcs *grovecorev1alpha1.PodCliqueSet
expected bool
}{
{
name: "container with GPU limits",
pcs: createPCSWithGPU(nil),
expected: true,
},
{
name: "container without GPU",
pcs: createPCSWithoutGPU(nil),
expected: false,
},
{
name: "empty cliques",
pcs: &grovecorev1alpha1.PodCliqueSet{},
expected: false,
},
{
name: "GPU in init container",
pcs: testutils.NewPodCliqueSetBuilder("test-pcs", "default", "").
WithPodCliqueTemplateSpec(
testutils.NewPodCliqueTemplateSpecBuilder("worker").
WithInitContainer(corev1.Container{
Name: "init",
Resources: corev1.ResourceRequirements{
Limits: corev1.ResourceList{
constants.GPUResourceName: resource.MustParse("1"),
},
},
}).
Build(),
).
Build(),
expected: true,
},
{
name: "GPU in requests not limits",
pcs: testutils.NewPodCliqueSetBuilder("test-pcs", "default", "").
WithPodCliqueTemplateSpec(
testutils.NewPodCliqueTemplateSpecBuilder("worker").
WithContainer(corev1.Container{
Name: "train",
Resources: corev1.ResourceRequirements{
Requests: corev1.ResourceList{
constants.GPUResourceName: resource.MustParse("2"),
},
},
}).
Build(),
).
Build(),
expected: true,
},
{
name: "GPU with zero quantity",
pcs: testutils.NewPodCliqueSetBuilder("test-pcs", "default", "").
WithPodCliqueTemplateSpec(
testutils.NewPodCliqueTemplateSpecBuilder("worker").
WithContainer(corev1.Container{
Name: "train",
Resources: corev1.ResourceRequirements{
Limits: corev1.ResourceList{
constants.GPUResourceName: resource.MustParse("0"),
},
},
}).
Build(),
).
Build(),
expected: false,
},
{
name: "multiple cliques - one with GPU",
pcs: testutils.NewPodCliqueSetBuilder("test-pcs", "default", "").
WithPodCliqueTemplateSpec(
testutils.NewPodCliqueTemplateSpecBuilder("controller").
WithContainer(testutils.NewContainer("ctrl", "busybox")).
Build(),
).
WithPodCliqueTemplateSpec(
testutils.NewPodCliqueTemplateSpecBuilder("worker").
WithContainer(testutils.NewGPUContainer("train", "nvidia/cuda:latest", 8)).
Build(),
).
Build(),
expected: true,
},
}

for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
result := hasGPURequirement(test.pcs)
assert.Equal(t, test.expected, result)
})
}
}

// createPCSWithGPU creates a PCS with GPU using the builder for tests in this package.
func createPCSWithGPU(annotations map[string]string) *grovecorev1alpha1.PodCliqueSet {
return testutils.NewPodCliqueSetBuilder("test-pcs", "default", "").
WithAnnotations(annotations).
WithPodCliqueTemplateSpec(
testutils.NewPodCliqueTemplateSpecBuilder("worker").
WithContainer(testutils.NewGPUContainer("train", "nvidia/cuda:latest", 8)).
Build(),
).
Build()
}

// createPCSWithoutGPU creates a PCS without GPU using the builder for tests in this package.
func createPCSWithoutGPU(annotations map[string]string) *grovecorev1alpha1.PodCliqueSet {
return testutils.NewPodCliqueSetBuilder("test-pcs", "default", "").
WithAnnotations(annotations).
WithPodCliqueTemplateSpec(
testutils.NewPodCliqueTemplateSpecBuilder("worker").
WithContainer(testutils.NewContainer("app", "nginx:latest")).
Build(),
).
Build()
}
99 changes: 99 additions & 0 deletions operator/internal/mnnvl/webhook.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
// /*
// Copyright 2026 The Grove Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// */

package mnnvl

import (
"fmt"

grovecorev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1"
)

// MutateAutoMNNVL adds the grove.io/auto-mnnvl annotation to a PodCliqueSet
// if all conditions are met:
// 1. Annotation does not already exist
// 2. MNNVL feature is enabled globally (autoMNNVLEnabled)
// 3. PCS has at least one container requesting GPU
//
// Returns true if the annotation was added, false otherwise.
func MutateAutoMNNVL(pcs *grovecorev1alpha1.PodCliqueSet, autoMNNVLEnabled bool) bool {
// If feature is disabled, don't add annotation
if !autoMNNVLEnabled {
return false
}

// If annotation already exists (user explicitly set it), don't override
if pcs.Annotations != nil {
if _, exists := pcs.Annotations[AnnotationAutoMNNVL]; exists {
return false

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

add a debug log here

}
}

// Check if PCS has GPU requirements
if !hasGPURequirement(pcs) {
return false

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

add a debug log here (I don't expect it to happen frequently)

}

// All conditions met - add the annotation
if pcs.Annotations == nil {
pcs.Annotations = make(map[string]string)
}
pcs.Annotations[AnnotationAutoMNNVL] = "true"

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

repeating a comment from previous PR - just in case - are we using the values of true/false or perhaps something like enabled/required/auto/etc.?

return true
}

// ValidateAutoMNNVLOnCreate validates the MNNVL annotation on PCS creation.
// Returns an error if the annotation is set to "true" but the MNNVL feature is disabled.
// This prevents users from explicitly requesting MNNVL when the cluster doesn't support it.
func ValidateAutoMNNVLOnCreate(pcs *grovecorev1alpha1.PodCliqueSet, autoMNNVLEnabled bool) error {
value, exists := pcs.Annotations[AnnotationAutoMNNVL]
if !exists {
return nil
}

// If annotation is "true" but feature is disabled, reject
if value == "true" && !autoMNNVLEnabled {
return fmt.Errorf("MNNVL is not enabled in the operator configuration. "+
"Either enable MNNVL globally or remove the %s annotation", AnnotationAutoMNNVL)
}

return nil
}

// ValidateAutoMNNVLOnUpdate ensures the grove.io/auto-mnnvl annotation is immutable.
// Returns an error if the annotation was added, removed, or its value was changed.
func ValidateAutoMNNVLOnUpdate(oldPCS, newPCS *grovecorev1alpha1.PodCliqueSet) error {
oldValue, oldExists := getAnnotationValue(oldPCS, AnnotationAutoMNNVL)
newValue, newExists := getAnnotationValue(newPCS, AnnotationAutoMNNVL)

// Check if annotation was added
if !oldExists && newExists {
return fmt.Errorf("annotation %s cannot be added after PodCliqueSet creation", AnnotationAutoMNNVL)
}

// Check if annotation was removed
if oldExists && !newExists {
return fmt.Errorf("annotation %s cannot be removed after PodCliqueSet creation", AnnotationAutoMNNVL)
}

// Check if annotation value was changed
if newExists && oldValue != newValue {
return fmt.Errorf("annotation %s is immutable and cannot be changed from %q to %q",
AnnotationAutoMNNVL, oldValue, newValue)
}

return nil
}
Loading
Loading