Skip to content

Commit c814ad6

Browse files
Merge pull request #62 from castai/msokolowski.CID-482
feat: support global apiURL/provider from umbrella chart and clusterIdConfigMapKeyRef
2 parents 892c427 + 643186b commit c814ad6

File tree

9 files changed

+220
-15
lines changed

9 files changed

+220
-15
lines changed

charts/gpu-metrics-exporter/templates/_helpers.tpl

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,4 +73,12 @@ Create the name of the gpu-metrics-exporter config map
7373
*/}}
7474
{{- define "gpu-metrics-exporter.config-map" -}}
7575
{{- printf "%s-%s" .Release.Name "gpu-metrics-exporter" | replace "+" "_" | trunc 63 | trimSuffix "-" }}
76-
{{- end }}
76+
{{- end }}
77+
78+
{{- define "gpu-metrics-exporter.apiURL" -}}
79+
{{- coalesce (dig "castai" "apiURL" "" (.Values.global | default dict)) .Values.castai.apiUrl -}}
80+
{{- end }}
81+
82+
{{- define "gpu-metrics-exporter.provider" -}}
83+
{{- coalesce (dig "castai" "provider" "" (.Values.global | default dict)) .Values.provider -}}
84+
{{- end }}

charts/gpu-metrics-exporter/templates/daemonset.yaml

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,8 @@ spec:
2929
imagePullSecrets:
3030
{{- toYaml . | nindent 8 }}
3131
{{- end }}
32-
{{- if eq (required ".Values.provider is required (gke|eks|aks|omni)" .Values.provider) "eks" }}
32+
{{- $provider := include "gpu-metrics-exporter.provider" . }}
33+
{{- if eq (required "provider or global.castai.provider is required (gke|eks|aks|omni)" $provider) "eks" }}
3334
priorityClassName: system-node-critical
3435
{{- end }}
3536
serviceAccountName: {{ include "gpu-metrics-exporter.serviceAccountName" . }}
@@ -41,7 +42,7 @@ spec:
4142
- name: {{- include "dcgm-exporter.config-map" . | indent 1 }}
4243
configMap:
4344
name: {{- include "dcgm-exporter.config-map" . | indent 1 }}
44-
{{- if eq .Values.provider "gke" }}
45+
{{- if eq $provider "gke" }}
4546
- name: "nvidia-install-dir-host"
4647
hostPath:
4748
path: /home/kubernetes/bin/nvidia
@@ -56,7 +57,7 @@ spec:
5657
affinity:
5758
{{- toYaml . | nindent 8 }}
5859
{{- end }}
59-
{{- else if (eq .Values.provider "gke")}}
60+
{{- else if (eq $provider "gke")}}
6061
{{- with .Values.gke.affinity }}
6162
affinity:
6263
{{- toYaml . | nindent 8 }}
@@ -111,6 +112,12 @@ spec:
111112
secretKeyRef:
112113
name: {{ .Values.castai.clusterIdSecretRef }}
113114
key: CLUSTER_ID
115+
{{- else if .Values.castai.clusterIdConfigMapKeyRef.name }}
116+
- name: "CLUSTER_ID"
117+
valueFrom:
118+
configMapKeyRef:
119+
name: {{ .Values.castai.clusterIdConfigMapKeyRef.name }}
120+
key: {{ .Values.castai.clusterIdConfigMapKeyRef.key | default "CLUSTER_ID" }}
114121
{{- end }}
115122
{{- if .Values.dcgmExporter.enabled }}
116123
- name: "DCGM_HOST"
@@ -128,14 +135,14 @@ spec:
128135
- NET_RAW
129136
runAsNonRoot: false
130137
runAsUser: 0
131-
{{- if eq .Values.provider "gke"}}
138+
{{- if eq $provider "gke"}}
132139
privileged: true
133140
{{- end }}
134141
image: "{{ .Values.dcgmExporter.image.repository }}:{{ .Values.dcgmExporter.image.tag }}"
135142
imagePullPolicy: {{ .Values.dcgmExporter.image.pullPolicy }}
136143
command: [ "/bin/bash", "-c" ]
137144
args:
138-
{{- if eq .Values.provider "gke"}}
145+
{{- if eq $provider "gke"}}
139146
{{- if .Values.dcgmExporter.useExternalHostEngine }}
140147
- hostname $NODE_NAME; dcgm-exporter --remote-hostengine-info $(NODE_IP) -f /etc/dcgm-exporter/counters.csv
141148
{{- else }}
@@ -162,7 +169,7 @@ spec:
162169
valueFrom:
163170
fieldRef:
164171
fieldPath: spec.nodeName
165-
{{- if eq .Values.provider "gke" }}
172+
{{- if eq $provider "gke" }}
166173
- name: "NODE_IP"
167174
valueFrom:
168175
fieldRef:
@@ -176,7 +183,7 @@ spec:
176183
- name: "pod-gpu-resources"
177184
readOnly: true
178185
mountPath: "/var/lib/kubelet/pod-resources"
179-
{{- if eq .Values.provider "gke" }}
186+
{{- if eq $provider "gke" }}
180187
- name: "nvidia-install-dir-host"
181188
mountPath: /usr/local/nvidia
182189
{{- end }}

charts/gpu-metrics-exporter/templates/gpu-exporter-configmap.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,10 @@ kind: ConfigMap
33
metadata:
44
name: {{- include "gpu-metrics-exporter.config-map" . | indent 1}}
55
data:
6-
{{- if not .Values.castai.clusterIdSecretRef }}
6+
{{- if and (not .Values.castai.clusterIdSecretRef) (not .Values.castai.clusterIdConfigMapKeyRef.name) }}
77
CLUSTER_ID: {{ required "castai.clusterId must be provided" .Values.castai.clusterId | quote }}
88
{{- end }}
9-
CAST_API: {{ required "castai.apiUrl must be provided" .Values.castai.apiUrl | quote }}
9+
CAST_API: {{ required "castai.apiUrl or global.castai.apiURL must be provided" (include "gpu-metrics-exporter.apiURL" .) | quote }}
1010

1111
{{- $config := .Values.gpuMetricsExporter.config | default dict }}
1212
{{- $otherConfig := omit $config "CLUSTER_ID" "CAST_API"}}

charts/gpu-metrics-exporter/templates/validation.yaml

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,25 @@
22
Validation for the effective provider (global.castai.provider, falling back to provider)
33
*/}}
44
{{- $validProviders := list "gke" "eks" "aks" "omni" }}
5-
{{- if and .Values.provider (not (has .Values.provider $validProviders)) }}
6-
{{- fail (printf "Invalid provider '%s'. Must be one of: %s" .Values.provider (join " | " $validProviders)) }}
5+
{{- $provider := include "gpu-metrics-exporter.provider" . }}
6+
{{- if and $provider (not (has $provider $validProviders)) }}
7+
{{- fail (printf "Invalid provider '%s'. Must be one of: %s" $provider (join " | " $validProviders)) }}
78
{{- end }}
89

910
{{/*
10-
Require either clusterId or clusterIdSecretRef
11+
Require one of clusterId, clusterIdSecretRef, or clusterIdConfigMapKeyRef
1112
*/}}
12-
{{- if and (not .Values.castai.clusterId) (not .Values.castai.clusterIdSecretRef) }}
13-
{{- fail "one of castai.clusterId or castai.clusterIdSecretRef must be set" }}
13+
{{- if and (not .Values.castai.clusterId) (not .Values.castai.clusterIdSecretRef) (not .Values.castai.clusterIdConfigMapKeyRef.name) }}
14+
{{- fail "one of castai.clusterId, castai.clusterIdSecretRef, or castai.clusterIdConfigMapKeyRef must be set" }}
15+
{{- end }}
16+
17+
{{/*
18+
Ensure mutual exclusivity of cluster ID sources
19+
*/}}
20+
{{- $clusterIdCount := 0 }}
21+
{{- if .Values.castai.clusterId }}{{ $clusterIdCount = add $clusterIdCount 1 }}{{ end }}
22+
{{- if .Values.castai.clusterIdSecretRef }}{{ $clusterIdCount = add $clusterIdCount 1 }}{{ end }}
23+
{{- if .Values.castai.clusterIdConfigMapKeyRef.name }}{{ $clusterIdCount = add $clusterIdCount 1 }}{{ end }}
24+
{{- if gt (int $clusterIdCount) 1 }}
25+
{{- fail "only one of castai.clusterId, castai.clusterIdSecretRef, or castai.clusterIdConfigMapKeyRef may be set" }}
1426
{{- end }}
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
suite: clusterIdConfigMapKeyRef wires CLUSTER_ID from external ConfigMap
2+
templates:
3+
- gpu-exporter-configmap.yaml
4+
- secret.yaml
5+
- daemonset.yaml
6+
set:
7+
castai.apiKeySecretRef: castai-credentials
8+
castai.clusterIdConfigMapKeyRef.name: castai-agent-metadata
9+
castai.clusterIdConfigMapKeyRef.key: CLUSTER_ID
10+
global.castai.provider: gke
11+
tests:
12+
- it: gpu-exporter ConfigMap omits CLUSTER_ID key
13+
template: gpu-exporter-configmap.yaml
14+
asserts:
15+
- notExists:
16+
path: data.CLUSTER_ID
17+
18+
- it: DaemonSet has CLUSTER_ID env var sourced from configMapKeyRef
19+
template: daemonset.yaml
20+
asserts:
21+
- contains:
22+
path: spec.template.spec.containers[0].env
23+
content:
24+
name: "CLUSTER_ID"
25+
valueFrom:
26+
configMapKeyRef:
27+
name: castai-agent-metadata
28+
key: CLUSTER_ID
29+
30+
- it: No gpu-metrics-exporter Secret is rendered (apiKeySecretRef is used instead)
31+
template: secret.yaml
32+
asserts:
33+
- hasDocuments:
34+
count: 0
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
suite: global values take precedence over local values
2+
templates:
3+
- gpu-exporter-configmap.yaml
4+
- secret.yaml
5+
- daemonset.yaml
6+
set:
7+
global.castai.apiKey: "global-api-key"
8+
global.castai.apiURL: "https://global.api.cast.ai"
9+
global.castai.provider: "gke"
10+
global.castai.clusterID: "global-cluster-id"
11+
castai.apiKey: "local-api-key"
12+
castai.apiUrl: "https://local.api.cast.ai"
13+
provider: "eks"
14+
castai.clusterId: "local-cluster-id"
15+
tests:
16+
- it: ConfigMap uses global CAST_API not local
17+
template: gpu-exporter-configmap.yaml
18+
asserts:
19+
- equal:
20+
path: data.CAST_API
21+
value: "https://global.api.cast.ai"
22+
23+
- it: ConfigMap uses local CLUSTER_ID (global.castai.clusterID is not supported)
24+
template: gpu-exporter-configmap.yaml
25+
asserts:
26+
- equal:
27+
path: data.CLUSTER_ID
28+
value: "local-cluster-id"
29+
30+
- it: Secret uses local API key (global.castai.apiKey is not supported)
31+
template: secret.yaml
32+
asserts:
33+
- equal:
34+
path: data.API_KEY
35+
value: "bG9jYWwtYXBpLWtleQ=="
36+
37+
- it: DaemonSet uses global provider gke so no priorityClassName
38+
template: daemonset.yaml
39+
asserts:
40+
- notExists:
41+
path: spec.template.spec.priorityClassName
42+
43+
- it: DaemonSet uses global provider gke so nvidia volume is present
44+
template: daemonset.yaml
45+
asserts:
46+
- contains:
47+
path: spec.template.spec.volumes
48+
content:
49+
name: "nvidia-install-dir-host"
50+
hostPath:
51+
path: /home/kubernetes/bin/nvidia
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
suite: local values work when globals are absent
2+
templates:
3+
- gpu-exporter-configmap.yaml
4+
- secret.yaml
5+
- daemonset.yaml
6+
set:
7+
castai.apiKey: "local-api-key"
8+
castai.apiUrl: "https://local.api.cast.ai"
9+
provider: "eks"
10+
castai.clusterId: "local-cluster-id"
11+
tests:
12+
- it: ConfigMap uses local CAST_API
13+
template: gpu-exporter-configmap.yaml
14+
asserts:
15+
- equal:
16+
path: data.CAST_API
17+
value: "https://local.api.cast.ai"
18+
19+
- it: ConfigMap uses local CLUSTER_ID
20+
template: gpu-exporter-configmap.yaml
21+
asserts:
22+
- equal:
23+
path: data.CLUSTER_ID
24+
value: "local-cluster-id"
25+
26+
- it: Secret uses local API key
27+
template: secret.yaml
28+
asserts:
29+
- equal:
30+
path: data.API_KEY
31+
value: "bG9jYWwtYXBpLWtleQ=="
32+
33+
- it: DaemonSet uses local provider eks so priorityClassName is set
34+
template: daemonset.yaml
35+
asserts:
36+
- equal:
37+
path: spec.template.spec.priorityClassName
38+
value: "system-node-critical"
39+
40+
- it: DaemonSet uses local provider eks so nvidia volume is absent
41+
template: daemonset.yaml
42+
asserts:
43+
- notContains:
44+
path: spec.template.spec.volumes
45+
content:
46+
name: "nvidia-install-dir-host"
47+
hostPath:
48+
path: /home/kubernetes/bin/nvidia
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
suite: mutual exclusivity of cluster ID sources
2+
templates:
3+
- validation.yaml
4+
set:
5+
castai.apiKey: "test-key"
6+
castai.apiUrl: "https://api.cast.ai"
7+
provider: "gke"
8+
tests:
9+
- it: fails when both clusterIdSecretRef and clusterIdConfigMapKeyRef are set
10+
set:
11+
castai.clusterIdSecretRef: "my-secret"
12+
castai.clusterIdConfigMapKeyRef.name: "my-configmap"
13+
castai.clusterIdConfigMapKeyRef.key: "CLUSTER_ID"
14+
asserts:
15+
- failedTemplate:
16+
errorMessage: "only one of castai.clusterId, castai.clusterIdSecretRef, or castai.clusterIdConfigMapKeyRef may be set"
17+
18+
- it: fails when both clusterId and clusterIdSecretRef are set
19+
set:
20+
castai.clusterId: "my-cluster"
21+
castai.clusterIdSecretRef: "my-secret"
22+
asserts:
23+
- failedTemplate:
24+
errorMessage: "only one of castai.clusterId, castai.clusterIdSecretRef, or castai.clusterIdConfigMapKeyRef may be set"
25+
26+
- it: fails when both clusterId and clusterIdConfigMapKeyRef are set
27+
set:
28+
castai.clusterId: "my-cluster"
29+
castai.clusterIdConfigMapKeyRef.name: "my-configmap"
30+
castai.clusterIdConfigMapKeyRef.key: "CLUSTER_ID"
31+
asserts:
32+
- failedTemplate:
33+
errorMessage: "only one of castai.clusterId, castai.clusterIdSecretRef, or castai.clusterIdConfigMapKeyRef may be set"

charts/gpu-metrics-exporter/values.yaml

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,8 @@
1+
global:
2+
castai:
3+
apiURL: ""
4+
provider: ""
5+
16
provider: "" # gke | eks | aks | omni
27
imagePullSecrets: [ ]
38

@@ -21,6 +26,13 @@ castai:
2126
# The referenced secret must provide the ID of the cluster in .data["CLUSTER_ID"]
2227
  # clusterId, clusterIdSecretRef, and clusterIdConfigMapKeyRef are mutually exclusive
2328
clusterIdSecretRef:
29+
30+
  # Reference to an existing ConfigMap (name) and the key within it that holds the cluster ID; key defaults to "CLUSTER_ID" when unset.
31+
  # clusterId, clusterIdSecretRef, and clusterIdConfigMapKeyRef are mutually exclusive; exactly one must be set.
32+
# The referenced ConfigMap must provide the cluster ID in the specified key.
33+
clusterIdConfigMapKeyRef:
34+
name: ""
35+
key: "CLUSTER_ID"
2436

2537
  # CASTAI public API URL. global.castai.apiURL, when set, takes precedence over this value.
2638
apiUrl: "https://api.cast.ai"

0 commit comments

Comments
 (0)