Skip to content

Commit 0411a1e

Browse files
authored
Auto-MNNVL: add MNNVL configuration and startup validation (#346)
Ref: GREP-270. Add MNNVL configuration support to the Grove operator. This includes configuration, startup validation, and Helm chart integration. Changes: - Add MNNVLConfiguration struct with Enabled field (default: false) and add it to the OperatorConfiguration. - Add validateMNNVLPrerequisites() to check for the ComputeDomain CRD at startup. - Update Helm chart values.yaml and _helpers.tpl with mnnvl.enabled. - Add unit tests for config parsing and CRD detection logic. When MNNVL is enabled but the ComputeDomain CRD is not installed, the operator exits with a non-zero exit code. Tested: unit tests.
1 parent 2ac8b96 commit 0411a1e

File tree

13 files changed

+413
-0
lines changed

13 files changed

+413
-0
lines changed

docs/api-reference/operator-api.md

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -781,6 +781,22 @@ _Appears in:_
781781
| `error` | ErrorLevel is a log level where only errors are logged.<br /> |
782782

783783

784+
#### NetworkAcceleration
785+
786+
787+
788+
NetworkAcceleration defines the configuration for network acceleration features.
789+
790+
791+
792+
_Appears in:_
793+
- [OperatorConfiguration](#operatorconfiguration)
794+
795+
| Field | Description | Default | Validation |
796+
| --- | --- | --- | --- |
797+
| `autoMNNVLEnabled` _boolean_ | AutoMNNVLEnabled indicates whether automatic MNNVL (Multi-Node NVLink) support is enabled.<br />When enabled, the operator will automatically create and manage ComputeDomain resources<br />for GPU workloads. If the cluster doesn't have the NVIDIA DRA driver installed,<br />the operator will exit with a non-zero exit code.<br />Default: false | false | |
798+
799+
784800

785801

786802
#### PodCliqueControllerConfiguration

operator/api/config/v1alpha1/types.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,8 @@ type OperatorConfiguration struct {
6565
LogFormat LogFormat `json:"logFormat"`
6666
Authorizer AuthorizerConfig `json:"authorizer"`
6767
TopologyAwareScheduling TopologyAwareSchedulingConfiguration `json:"topologyAwareScheduling"`
68+
// +optional
69+
Network NetworkAcceleration `json:"network,omitempty"` // Network is the configuration for network acceleration features like MNNVL.
6870
}
6971

7072
// LeaderElectionConfiguration defines the configuration for the leader election.
@@ -200,3 +202,13 @@ type TopologyAwareSchedulingConfiguration struct {
200202
// +optional
201203
Levels []corev1alpha1.TopologyLevel `json:"levels,omitempty"`
202204
}
205+
206+
// NetworkAcceleration defines the configuration for network acceleration features.
207+
type NetworkAcceleration struct {
208+
// AutoMNNVLEnabled indicates whether automatic MNNVL (Multi-Node NVLink) support is enabled.
209+
// When enabled, the operator will automatically create and manage ComputeDomain resources
210+
// for GPU workloads. If the cluster doesn't have the NVIDIA DRA driver installed,
211+
// the operator will exit with a non-zero exit code.
212+
// Default: false
213+
AutoMNNVLEnabled bool `json:"autoMNNVLEnabled"`
214+
}

operator/api/config/v1alpha1/zz_generated.deepcopy.go

Lines changed: 17 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

operator/charts/templates/_helpers.tpl

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,10 @@ config.yaml: |
5555
[]
5656
{{- end }}
5757
{{- end }}
58+
{{- if .Values.config.network }}
59+
network:
60+
autoMNNVLEnabled: {{ .Values.config.network.autoMNNVLEnabled | default false }}
61+
{{- end }}
5862

5963
{{- end -}}
6064

operator/charts/values.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,8 @@ config:
8585
- system:kube-controller-manager
8686
- system:serviceaccount:kai-scheduler:pod-grouper
8787
- system:serviceaccount:kai-scheduler:binder
88+
network:
89+
autoMNNVLEnabled: false
8890

8991
configMap:
9092
labels:

operator/cmd/cli/cli.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,8 @@ const (
4343
ExitErrInitializeManager
4444
// ExitErrStart indicates that the application exited due to an error when starting the application.
4545
ExitErrStart
46+
// ExitErrMNNVLPrerequisites indicates that the application exited because MNNVL prerequisites are not met.
47+
ExitErrMNNVLPrerequisites
4648
)
4749

4850
var (

operator/cmd/cli/cli_test.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,15 @@ func TestLoadAndValidateOperatorConfig(t *testing.T) {
130130
assert.Equal(t, 9445, config.Server.Metrics.Port)
131131
assert.NotNil(t, config.Controllers.PodCliqueSet.ConcurrentSyncs)
132132
assert.Equal(t, 3, *config.Controllers.PodCliqueSet.ConcurrentSyncs)
133+
assert.False(t, config.Network.AutoMNNVLEnabled, "MNNVL should be disabled by default")
134+
},
135+
},
136+
{
137+
name: "valid config with MNNVL enabled",
138+
configFile: "testdata/valid-config-mnnvl-enabled.yaml",
139+
validateFunc: func(t *testing.T, config *configv1alpha1.OperatorConfiguration) {
140+
require.NotNil(t, config)
141+
assert.True(t, config.Network.AutoMNNVLEnabled, "MNNVL should be enabled")
133142
},
134143
},
135144
{
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
apiVersion: operator.config.grove.io/v1alpha1
2+
kind: OperatorConfiguration
3+
runtimeClientConnection:
4+
qps: 100
5+
burst: 150
6+
leaderElection:
7+
enabled: true
8+
leaseDuration: 15s
9+
renewDeadline: 10s
10+
retryPeriod: 2s
11+
resourceLock: leases
12+
resourceName: grove-operator-leader-election
13+
resourceNamespace: default
14+
server:
15+
webhooks:
16+
bindAddress: 0.0.0.0
17+
port: 9443
18+
serverCertDir: /etc/grove-operator/webhook-certs
19+
healthProbes:
20+
bindAddress: 0.0.0.0
21+
port: 9444
22+
metrics:
23+
bindAddress: 0.0.0.0
24+
port: 9445
25+
controllers:
26+
podCliqueSet:
27+
concurrentSyncs: 3
28+
podClique:
29+
concurrentSyncs: 3
30+
podCliqueScalingGroup:
31+
concurrentSyncs: 2
32+
logLevel: info
33+
logFormat: json
34+
authorizer:
35+
enabled: false
36+
clusterTopology:
37+
enabled: false
38+
network:
39+
autoMNNVLEnabled: true
40+

operator/cmd/cli/testdata/valid-config.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,4 +35,6 @@ authorizer:
3535
enabled: false
3636
clusterTopology:
3737
enabled: false
38+
network:
39+
autoMNNVLEnabled: false
3840

operator/cmd/main.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ import (
3131
grovectrl "github.com/ai-dynamo/grove/operator/internal/controller"
3232
"github.com/ai-dynamo/grove/operator/internal/controller/cert"
3333
grovelogger "github.com/ai-dynamo/grove/operator/internal/logger"
34+
"github.com/ai-dynamo/grove/operator/internal/mnnvl"
3435
groveversion "github.com/ai-dynamo/grove/operator/internal/version"
3536

3637
"github.com/spf13/pflag"
@@ -63,6 +64,12 @@ func main() {
6364
logger.Info("Starting grove operator", "grove-info", groveInfo.Verbose())
6465
printFlags()
6566

67+
// Run MNNVL preflight checks if the feature is enabled
68+
if err := mnnvl.Preflight(operatorConfig); err != nil {
69+
logger.Error(err, "MNNVL preflight check failed")
70+
handleErrorAndExit(err, cli.ExitErrMNNVLPrerequisites)
71+
}
72+
6673
mgr, err := grovectrl.CreateManager(operatorConfig)
6774
if err != nil {
6875
logger.Error(err, "failed to create grove controller manager")

0 commit comments

Comments
 (0)