Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions api/nvidia/v1/clusterpolicy_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -921,6 +921,11 @@ type DCGMExporterSpec struct {
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Service configuration for NVIDIA DCGM Exporter"
ServiceSpec *DCGMExporterServiceConfig `json:"service,omitempty"`

// Optional: Annotations is an unstructured key value map stored with a resource that may be
// set by external tools to store and retrieve arbitrary metadata. They are not
// queryable and should be preserved when modifying objects.
// When set, these annotations are added to the pod template of the NVIDIA DCGM Exporter DaemonSet.
Annotations map[string]string `json:"annotations,omitempty"`
}

// DCGMExporterMetricsConfig defines metrics to be collected by NVIDIA DCGM Exporter
Expand Down
7 changes: 7 additions & 0 deletions api/nvidia/v1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 8 additions & 0 deletions config/crd/bases/nvidia.com_clusterpolicies.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -510,6 +510,14 @@ spec:
version:
description: NVIDIA DCGM Exporter image tag
type: string
annotations:
additionalProperties:
type: string
description: |-
Optional: Annotations is an unstructured key value map stored with a resource that may be
set by external tools to store and retrieve arbitrary metadata. They are not
queryable and should be preserved when modifying objects.
type: object
type: object
devicePlugin:
description: DevicePlugin component spec
Expand Down
10 changes: 10 additions & 0 deletions controllers/object_controls.go
Original file line number Diff line number Diff line change
Expand Up @@ -1621,6 +1621,16 @@ func TransformDCGMExporter(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpe
obj.Spec.Template.Spec.Containers[0].Args = config.DCGMExporter.Args
}

// copy any exporter-specific annotations onto the DaemonSet's pod template
if len(config.DCGMExporter.Annotations) > 0 {
if obj.Spec.Template.Annotations == nil {
obj.Spec.Template.Annotations = make(map[string]string)
}
for annoKey, annoVal := range config.DCGMExporter.Annotations {
obj.Spec.Template.Annotations[annoKey] = annoVal
}
}

// check if DCGM hostengine is enabled as a separate Pod and setup env accordingly
if config.DCGM.IsEnabled() {
setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), DCGMRemoteEngineEnvName, fmt.Sprintf("nvidia-dcgm:%d", DCGMDefaultPort))
Expand Down
118 changes: 117 additions & 1 deletion controllers/transforms_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -803,7 +803,7 @@ func TestTransformDCGMExporter(t *testing.T) {
expectedDs Daemonset // Expected output DaemonSet
}{
{
description: "transform dcgm exporter",
description: "transform dcgm exporter without annotations",
ds: NewDaemonset().
WithContainer(corev1.Container{Name: "dcgm-exporter"}).
WithContainer(corev1.Container{Name: "dummy"}),
Expand Down Expand Up @@ -833,6 +833,122 @@ func TestTransformDCGMExporter(t *testing.T) {
},
}).WithContainer(corev1.Container{Name: "dummy"}).WithPullSecret("pull-secret").WithRuntimeClassName("nvidia"),
},
{
description: "transform dcgm exporter with annotations",
ds: NewDaemonset().
WithContainer(corev1.Container{Name: "dcgm-exporter"}).
WithContainer(corev1.Container{Name: "dummy"}),
cpSpec: &gpuv1.ClusterPolicySpec{
DCGMExporter: gpuv1.DCGMExporterSpec{
Repository: "nvcr.io/nvidia/cloud-native",
Image: "dcgm-exporter",
Version: "v1.0.0",
ImagePullPolicy: "IfNotPresent",
ImagePullSecrets: []string{"pull-secret"},
Args: []string{"--fail-on-init-error=false"},
Annotations: map[string]string{"dcgm-exporter": "test"},
Env: []gpuv1.EnvVar{
{Name: "foo", Value: "bar"},
},
},
DCGM: gpuv1.DCGMSpec{
Enabled: newBoolPtr(true),
},
},
expectedDs: NewDaemonset().WithContainer(corev1.Container{
Name: "dcgm-exporter",
Image: "nvcr.io/nvidia/cloud-native/dcgm-exporter:v1.0.0",
ImagePullPolicy: corev1.PullIfNotPresent,
Args: []string{"--fail-on-init-error=false"},
Env: []corev1.EnvVar{
{Name: "DCGM_REMOTE_HOSTENGINE_INFO", Value: "nvidia-dcgm:5555"},
},
}).WithContainer(corev1.Container{Name: "dummy"}).WithPullSecret("pull-secret").WithRuntimeClassName("nvidia").WithPodAnnotations(map[string]string{"dcgm-exporter": "test"}),
},
{
description: "transform dcgm exporter with annotations and common annotations",
ds: NewDaemonset().
WithContainer(corev1.Container{Name: "dcgm-exporter"}).
WithContainer(corev1.Container{Name: "dummy"}),
cpSpec: &gpuv1.ClusterPolicySpec{
Daemonsets: gpuv1.DaemonsetsSpec{Annotations: map[string]string{
"key": "value",
"app": "value",
"app.kubernetes.io/part-of": "value",
}},
DCGMExporter: gpuv1.DCGMExporterSpec{
Repository: "nvcr.io/nvidia/cloud-native",
Image: "dcgm-exporter",
Version: "v1.0.0",
ImagePullPolicy: "IfNotPresent",
ImagePullSecrets: []string{"pull-secret"},
Args: []string{"--fail-on-init-error=false"},
Annotations: map[string]string{"dcgm-exporter": "test"},
Env: []gpuv1.EnvVar{
{Name: "foo", Value: "bar"},
},
},
DCGM: gpuv1.DCGMSpec{
Enabled: newBoolPtr(true),
},
},
expectedDs: NewDaemonset().WithContainer(corev1.Container{
Name: "dcgm-exporter",
Image: "nvcr.io/nvidia/cloud-native/dcgm-exporter:v1.0.0",
ImagePullPolicy: corev1.PullIfNotPresent,
Args: []string{"--fail-on-init-error=false"},
Env: []corev1.EnvVar{
{Name: "DCGM_REMOTE_HOSTENGINE_INFO", Value: "nvidia-dcgm:5555"},
},
}).WithContainer(corev1.Container{Name: "dummy"}).WithPullSecret("pull-secret").WithRuntimeClassName("nvidia").
WithPodAnnotations(map[string]string{
"dcgm-exporter": "test",
"key": "value",
"app": "value",
"app.kubernetes.io/part-of": "value",
}),
},
{
description: "transform dcgm exporter only with common annotations",
ds: NewDaemonset().
WithContainer(corev1.Container{Name: "dcgm-exporter"}).
WithContainer(corev1.Container{Name: "dummy"}),
cpSpec: &gpuv1.ClusterPolicySpec{
Daemonsets: gpuv1.DaemonsetsSpec{Annotations: map[string]string{
"key": "value",
"app": "value",
"app.kubernetes.io/part-of": "value",
}},
DCGMExporter: gpuv1.DCGMExporterSpec{
Repository: "nvcr.io/nvidia/cloud-native",
Image: "dcgm-exporter",
Version: "v1.0.0",
ImagePullPolicy: "IfNotPresent",
ImagePullSecrets: []string{"pull-secret"},
Args: []string{"--fail-on-init-error=false"},
Env: []gpuv1.EnvVar{
{Name: "foo", Value: "bar"},
},
},
DCGM: gpuv1.DCGMSpec{
Enabled: newBoolPtr(true),
},
},
expectedDs: NewDaemonset().WithContainer(corev1.Container{
Name: "dcgm-exporter",
Image: "nvcr.io/nvidia/cloud-native/dcgm-exporter:v1.0.0",
ImagePullPolicy: corev1.PullIfNotPresent,
Args: []string{"--fail-on-init-error=false"},
Env: []corev1.EnvVar{
{Name: "DCGM_REMOTE_HOSTENGINE_INFO", Value: "nvidia-dcgm:5555"},
},
}).WithContainer(corev1.Container{Name: "dummy"}).WithPullSecret("pull-secret").WithRuntimeClassName("nvidia").
WithPodAnnotations(map[string]string{
"key": "value",
"app": "value",
"app.kubernetes.io/part-of": "value",
}),
},
}

for _, tc := range testCases {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -510,6 +510,14 @@ spec:
version:
description: NVIDIA DCGM Exporter image tag
type: string
annotations:
additionalProperties:
type: string
description: |-
Optional: Annotations is an unstructured key value map stored with a resource that may be
set by external tools to store and retrieve arbitrary metadata. They are not
queryable and should be preserved when modifying objects.
type: object
type: object
devicePlugin:
description: DevicePlugin component spec
Expand Down
3 changes: 3 additions & 0 deletions deployments/gpu-operator/templates/clusterpolicy.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -521,6 +521,9 @@ spec:
{{- if .Values.dcgmExporter.service }}
service: {{ toYaml .Values.dcgmExporter.service | nindent 6 }}
{{- end }}
{{- if .Values.dcgmExporter.annotations }}
annotations: {{ toYaml .Values.dcgmExporter.annotations | nindent 6 }}
{{- end }}
gfd:
enabled: {{ .Values.gfd.enabled }}
{{- if .Values.gfd.repository }}
Expand Down