Skip to content

Commit b04453a

Browse files
Shiva Kumar (shivakunv)
authored and committed
vgpu-manager: enable kernel module configuration via KernelModuleConfig
Signed-off-by: Shiva Kumar (SW-CLOUD) <shivaku@nvidia.com>
1 parent a846cef commit b04453a

File tree

10 files changed

+107
-22
lines changed

10 files changed

+107
-22
lines changed

api/nvidia/v1/clusterpolicy_types.go

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -651,6 +651,11 @@ type VGPUManagerSpec struct {
651651

652652
// DriverManager represents configuration for NVIDIA Driver Manager initContainer
653653
DriverManager DriverManagerSpec `json:"driverManager,omitempty"`
654+
655+
// Optional: Kernel module configuration parameters for the vGPU manager
656+
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
657+
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Kernel module configuration parameters for the vGPU manager"
658+
KernelModuleConfig *KernelModuleConfigSpec `json:"kernelModuleConfig,omitempty"`
654659
}
655660

656661
// ToolkitSpec defines the properties for NVIDIA Container Toolkit deployment

api/nvidia/v1/zz_generated.deepcopy.go

Lines changed: 5 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

bundle/manifests/nvidia.com_clusterpolicies.yaml

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2321,6 +2321,13 @@ spec:
23212321
items:
23222322
type: string
23232323
type: array
2324+
kernelModuleConfig:
2325+
description: 'Optional: Kernel module configuration parameters
2326+
for the vGPU manager'
2327+
properties:
2328+
name:
2329+
type: string
2330+
type: object
23242331
repository:
23252332
description: NVIDIA vGPU Manager image repository
23262333
type: string

config/crd/bases/nvidia.com_clusterpolicies.yaml

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2321,6 +2321,13 @@ spec:
23212321
items:
23222322
type: string
23232323
type: array
2324+
kernelModuleConfig:
2325+
description: 'Optional: Kernel module configuration parameters
2326+
for the vGPU manager'
2327+
properties:
2328+
name:
2329+
type: string
2330+
type: object
23242331
repository:
23252332
description: NVIDIA vGPU Manager image repository
23262333
type: string

config/samples/v1_clusterpolicy.yaml

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -101,6 +101,9 @@ spec:
101101

102102
vgpuManager:
103103
enabled: true
104+
# kernel module configuration for vGPU manager
105+
kernelModuleConfig:
106+
name: ""
104107

105108
vgpuDeviceManager:
106109
enabled: true

controllers/object_controls.go

Lines changed: 26 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -183,6 +183,10 @@ const (
183183
NvidiaRuntimeSetAsDefaultEnvName = "NVIDIA_RUNTIME_SET_AS_DEFAULT"
184184
// NRIAnnotationDomain represents the domain name used for NRI annotations used for CDI device injections
185185
NRIAnnotationDomain = "nvidia.cdi.k8s.io"
186+
187+
// driversDir is the name of the directory used by the driver-container to represent the path
188+
// of the drivers directory mounted in the container
189+
driversDir = "/drivers"
186190
)
187191

188192
// ContainerProbe defines container probe types
@@ -2887,13 +2891,21 @@ func transformPeerMemoryContainer(obj *appsv1.DaemonSet, config *gpuv1.ClusterPo
28872891
if config.Driver.KernelModuleConfig != nil && config.Driver.KernelModuleConfig.Name != "" {
28882892
// note: transformDriverContainer() will have already created a Volume backed by the ConfigMap.
28892893
// Only add a VolumeMount for nvidia-peermem-ctr.
2890-
destinationDir := "/drivers"
2891-
volumeMounts, _, err := createConfigMapVolumeMounts(n, config.Driver.KernelModuleConfig.Name, destinationDir)
2894+
volumeMounts, _, err := createConfigMapVolumeMounts(n, config.Driver.KernelModuleConfig.Name, driversDir)
28922895
if err != nil {
28932896
return fmt.Errorf("ERROR: failed to create ConfigMap VolumeMounts for kernel module configuration: %v", err)
28942897
}
28952898
obj.Spec.Template.Spec.Containers[i].VolumeMounts = append(obj.Spec.Template.Spec.Containers[i].VolumeMounts, volumeMounts...)
28962899
}
2900+
if config.VGPUManager.KernelModuleConfig != nil && config.VGPUManager.KernelModuleConfig.Name != "" {
2901+
// note: transformVGPUManagerContainer() will have already created a Volume backed by the ConfigMap.
2902+
// Only add a VolumeMount for nvidia-vgpu-manager-ctr.
2903+
volumeMounts, _, err := createConfigMapVolumeMounts(n, config.VGPUManager.KernelModuleConfig.Name, driversDir)
2904+
if err != nil {
2905+
return fmt.Errorf("failed to create ConfigMap VolumeMounts for vGPU manager kernel module configuration: %w", err)
2906+
}
2907+
obj.Spec.Template.Spec.Containers[i].VolumeMounts = append(obj.Spec.Template.Spec.Containers[i].VolumeMounts, volumeMounts...)
2908+
}
28972909
if config.Driver.Resources != nil {
28982910
obj.Spec.Template.Spec.Containers[i].Resources = corev1.ResourceRequirements{
28992911
Requests: config.Driver.Resources.Requests,
@@ -3566,8 +3578,7 @@ func transformDriverContainer(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicy
35663578

35673579
// mount any custom kernel module configuration parameters at /drivers
35683580
if config.Driver.KernelModuleConfig != nil && config.Driver.KernelModuleConfig.Name != "" {
3569-
destinationDir := "/drivers"
3570-
volumeMounts, itemsToInclude, err := createConfigMapVolumeMounts(n, config.Driver.KernelModuleConfig.Name, destinationDir)
3581+
volumeMounts, itemsToInclude, err := createConfigMapVolumeMounts(n, config.Driver.KernelModuleConfig.Name, driversDir)
35713582
if err != nil {
35723583
return fmt.Errorf("ERROR: failed to create ConfigMap VolumeMounts for kernel module configuration: %v", err)
35733584
}
@@ -3698,6 +3709,7 @@ func createSecretEnvReference(ctx context.Context, ctrlClient client.Client, sec
36983709
}
36993710

37003711
func transformVGPUManagerContainer(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) error {
3712+
podSpec := &obj.Spec.Template.Spec
37013713
container := findContainerByName(obj.Spec.Template.Spec.Containers, "nvidia-vgpu-manager-ctr")
37023714

37033715
if container == nil {
@@ -3745,6 +3757,16 @@ func transformVGPUManagerContainer(obj *appsv1.DaemonSet, config *gpuv1.ClusterP
37453757
}
37463758
}
37473759

3760+
// mount any custom kernel module configuration parameters at /drivers
3761+
if config.VGPUManager.KernelModuleConfig != nil && config.VGPUManager.KernelModuleConfig.Name != "" {
3762+
volumeMounts, itemsToInclude, err := createConfigMapVolumeMounts(n, config.VGPUManager.KernelModuleConfig.Name, driversDir)
3763+
if err != nil {
3764+
return fmt.Errorf("failed to create ConfigMap VolumeMounts for kernel module configuration: %w", err)
3765+
}
3766+
container.VolumeMounts = append(container.VolumeMounts, volumeMounts...)
3767+
podSpec.Volumes = append(podSpec.Volumes, createConfigMapVolume(config.VGPUManager.KernelModuleConfig.Name, itemsToInclude))
3768+
}
3769+
37483770
return nil
37493771
}
37503772

controllers/object_controls_test.go

Lines changed: 41 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -104,15 +104,16 @@ var kubernetesResources = []client.Object{
104104
}
105105

106106
type commonDaemonsetSpec struct {
107-
repository string
108-
image string
109-
version string
110-
imagePullPolicy string
111-
imagePullSecrets []corev1.LocalObjectReference
112-
args []string
113-
env []gpuv1.EnvVar
114-
resources *gpuv1.ResourceRequirements
115-
startupProbe *gpuv1.ContainerProbeSpec
107+
repository string
108+
image string
109+
version string
110+
imagePullPolicy string
111+
imagePullSecrets []corev1.LocalObjectReference
112+
args []string
113+
env []gpuv1.EnvVar
114+
resources *gpuv1.ResourceRequirements
115+
startupProbe *gpuv1.ContainerProbeSpec
116+
kernelModuleConfig *gpuv1.KernelModuleConfigSpec
116117
}
117118

118119
func TestMain(m *testing.M) {
@@ -371,14 +372,15 @@ func testDaemonsetCommon(t *testing.T, cp *gpuv1.ClusterPolicy, component string
371372
}
372373
case "VGPUManager":
373374
spec = commonDaemonsetSpec{
374-
repository: cp.Spec.VGPUManager.Repository,
375-
image: cp.Spec.VGPUManager.Image,
376-
version: cp.Spec.VGPUManager.Version,
377-
imagePullPolicy: cp.Spec.VGPUManager.ImagePullPolicy,
378-
imagePullSecrets: getImagePullSecrets(cp.Spec.VGPUManager.ImagePullSecrets),
379-
args: cp.Spec.VGPUManager.Args,
380-
env: cp.Spec.VGPUManager.Env,
381-
resources: cp.Spec.VGPUManager.Resources,
375+
repository: cp.Spec.VGPUManager.Repository,
376+
image: cp.Spec.VGPUManager.Image,
377+
version: cp.Spec.VGPUManager.Version,
378+
imagePullPolicy: cp.Spec.VGPUManager.ImagePullPolicy,
379+
imagePullSecrets: getImagePullSecrets(cp.Spec.VGPUManager.ImagePullSecrets),
380+
args: cp.Spec.VGPUManager.Args,
381+
env: cp.Spec.VGPUManager.Env,
382+
resources: cp.Spec.VGPUManager.Resources,
383+
kernelModuleConfig: cp.Spec.VGPUManager.KernelModuleConfig,
382384
}
383385
dsLabel = "nvidia-vgpu-manager-daemonset"
384386
mainCtrName = "nvidia-vgpu-manager-ctr"
@@ -765,7 +767,7 @@ func getVGPUManagerTestInput(testCase string) *gpuv1.ClusterPolicy {
765767
cp.Spec.VGPUManager.ImagePullSecrets = []string{"ngc-secret"}
766768
cp.Spec.VGPUManager.DriverManager.ImagePullSecrets = []string{"ngc-secret"}
767769
clusterPolicyController.sandboxEnabled = true
768-
770+
cp.Spec.VGPUManager.KernelModuleConfig = &gpuv1.KernelModuleConfigSpec{Name: "vgpu-manager-kernel-module-config"}
769771
switch testCase {
770772
case "default":
771773
// Do nothing
@@ -785,6 +787,7 @@ func getVGPUManagerTestOutput(testCase string) map[string]interface{} {
785787
"driverImage": "nvcr.io/nvidia/vgpu-manager:470.57.02-ubuntu22.04",
786788
"driverManagerImage": "nvcr.io/nvidia/cloud-native/k8s-driver-manager:v0.3.0",
787789
"imagePullSecret": "ngc-secret",
790+
"kernelModuleConfig": "vgpu-manager-kernel-module-config",
788791
}
789792

790793
switch testCase {
@@ -814,6 +817,23 @@ func TestVGPUManager(t *testing.T) {
814817

815818
for _, tc := range testCases {
816819
t.Run(tc.description, func(t *testing.T) {
820+
// Create the kernel module ConfigMap
821+
if tc.clusterPolicy.Spec.VGPUManager.KernelModuleConfig != nil && tc.clusterPolicy.Spec.VGPUManager.KernelModuleConfig.Name != "" {
822+
cm := &corev1.ConfigMap{
823+
ObjectMeta: metav1.ObjectMeta{
824+
Name: tc.clusterPolicy.Spec.VGPUManager.KernelModuleConfig.Name,
825+
Namespace: clusterPolicyController.operatorNamespace,
826+
},
827+
Data: map[string]string{
828+
"nvidia.conf": "# Test vGPU manager kernel module configuration\n",
829+
},
830+
}
831+
err := clusterPolicyController.client.Create(clusterPolicyController.ctx, cm)
832+
if err != nil {
833+
t.Fatalf("error creating kernel module ConfigMap: %v", err)
834+
}
835+
}
836+
817837
ds, err := testDaemonsetCommon(t, tc.clusterPolicy, "VGPUManager", tc.output["numDaemonsets"].(int))
818838
if err != nil {
819839
t.Fatalf("error in testDaemonsetCommon(): %v", err)
@@ -850,6 +870,9 @@ func TestVGPUManager(t *testing.T) {
850870
}
851871

852872
func TestVGPUManagerAssets(t *testing.T) {
873+
// Clear any KernelModuleConfig that might have been set by previous tests to avoid missing ConfigMap errors
874+
clusterPolicyController.singleton.Spec.VGPUManager.KernelModuleConfig = nil
875+
853876
manifestPath := filepath.Join(cfg.root, vGPUManagerAssetsPath)
854877
// add manifests
855878
addState(&clusterPolicyController, manifestPath)

deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2321,6 +2321,13 @@ spec:
23212321
items:
23222322
type: string
23232323
type: array
2324+
kernelModuleConfig:
2325+
description: 'Optional: Kernel module configuration parameters
2326+
for the vGPU manager'
2327+
properties:
2328+
name:
2329+
type: string
2330+
type: object
23242331
repository:
23252332
description: NVIDIA vGPU Manager image repository
23262333
type: string

deployments/gpu-operator/templates/clusterpolicy.yaml

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -279,6 +279,9 @@ spec:
279279
{{- if .Values.vgpuManager.args }}
280280
args: {{ toYaml .Values.vgpuManager.args | nindent 6 }}
281281
{{- end }}
282+
{{- if .Values.vgpuManager.kernelModuleConfig }}
283+
kernelModuleConfig: {{ toYaml .Values.vgpuManager.kernelModuleConfig | nindent 6 }}
284+
{{- end }}
282285
driverManager:
283286
{{- if .Values.vgpuManager.driverManager.repository }}
284287
repository: {{ .Values.vgpuManager.driverManager.repository }}

deployments/gpu-operator/values.yaml

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -432,6 +432,9 @@ vgpuManager:
432432
version: v0.9.1
433433
imagePullPolicy: IfNotPresent
434434
env: []
435+
# kernel module configuration for vGPU manager
436+
kernelModuleConfig:
437+
name: ""
435438

436439
vgpuDeviceManager:
437440
enabled: true

0 commit comments

Comments
 (0)