Skip to content

Commit 9e98d6e

Browse files
authored
fix: make preflight chart consistent with other subcharts (#1114)
Signed-off-by: Ajay Mishra <ajmishra@nvidia.com>
1 parent 4515846 commit 9e98d6e

File tree

7 files changed

+30
-31
lines changed

7 files changed

+30
-31
lines changed

distros/kubernetes/nvsentinel/charts/preflight/templates/_helpers.tpl

Lines changed: 2 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -18,23 +18,14 @@ limitations under the License.
1818
Expand the name of the chart.
1919
*/}}
2020
{{- define "preflight.name" -}}
21-
{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }}
21+
{{- .Chart.Name | trunc 63 | trimSuffix "-" }}
2222
{{- end }}
2323

2424
{{/*
2525
Create a default fully qualified app name.
2626
*/}}
2727
{{- define "preflight.fullname" -}}
28-
{{- if .Values.fullnameOverride }}
29-
{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }}
30-
{{- else }}
31-
{{- $name := default .Chart.Name .Values.nameOverride }}
32-
{{- if contains $name .Release.Name }}
33-
{{- .Release.Name | trunc 63 | trimSuffix "-" }}
34-
{{- else }}
35-
{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }}
36-
{{- end }}
37-
{{- end }}
28+
{{- "preflight" | trunc 63 | trimSuffix "-" }}
3829
{{- end }}
3930

4031
{{/*

distros/kubernetes/nvsentinel/charts/preflight/templates/configmap.yaml

Lines changed: 7 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -15,7 +15,7 @@
1515
apiVersion: v1
1616
kind: ConfigMap
1717
metadata:
18-
name: {{ include "preflight.fullname" . }}-config
18+
name: {{ include "preflight.fullname" . }}
1919
namespace: {{ .Release.Namespace }}
2020
labels:
2121
{{- include "preflight.labels" . | nindent 4 }}
@@ -75,14 +75,16 @@ data:
7575
{{- end }}
7676
initContainers:
7777
{{- $extraEnv := .Values.ncclAllreduceExtraEnv | default list }}
78+
{{- $globalTag := ((.Values.global).image).tag | default "" }}
79+
{{- $appVersion := .Chart.AppVersion }}
7880
{{- $result := list }}
7981
{{- range .Values.initContainers }}
80-
{{- if and (eq .name "preflight-nccl-allreduce") $extraEnv }}
8182
{{- $container := deepCopy . }}
83+
{{- $tag := $container.image.tag | default $globalTag | default $appVersion }}
84+
{{- $_ := set $container "image" (printf "%s:%s" $container.image.repository $tag) }}
85+
{{- if and (eq .name "preflight-nccl-allreduce") $extraEnv }}
8286
{{- $_ := set $container "env" (concat (.env | default list) $extraEnv) }}
83-
{{- $result = append $result $container }}
84-
{{- else }}
85-
{{- $result = append $result . }}
8687
{{- end }}
88+
{{- $result = append $result $container }}
8789
{{- end }}
8890
{{- toYaml $result | nindent 6 }}

distros/kubernetes/nvsentinel/charts/preflight/templates/deployment.yaml

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -36,7 +36,7 @@ spec:
3636
serviceAccountName: {{ include "preflight.serviceAccountName" . }}
3737
containers:
3838
- name: {{ .Chart.Name }}
39-
image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}"
39+
image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default ((.Values.global).image).tag | default .Chart.AppVersion }}"
4040
imagePullPolicy: {{ .Values.image.pullPolicy }}
4141
args:
4242
- --port={{ .Values.webhook.port }}
@@ -67,7 +67,7 @@ spec:
6767
secretName: {{ include "preflight.certSecretName" . }}
6868
- name: config
6969
configMap:
70-
name: {{ include "preflight.fullname" . }}-config
70+
name: {{ include "preflight.fullname" . }}
7171
{{- with .Values.nodeSelector }}
7272
nodeSelector:
7373
{{- toYaml . | nindent 8 }}

distros/kubernetes/nvsentinel/charts/preflight/values.yaml

Lines changed: 11 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -17,11 +17,9 @@ replicaCount: 1
1717
image:
1818
repository: ghcr.io/nvidia/nvsentinel/preflight
1919
pullPolicy: IfNotPresent
20-
tag: "latest"
20+
tag: ""
2121

2222
imagePullSecrets: []
23-
nameOverride: ""
24-
fullnameOverride: ""
2523

2624
serviceAccount:
2725
create: true
@@ -103,7 +101,9 @@ dcgm:
103101

104102
initContainers:
105103
- name: preflight-dcgm-diag
106-
image: ghcr.io/nvidia/nvsentinel/preflight-dcgm-diag:latest
104+
image:
105+
repository: ghcr.io/nvidia/nvsentinel/preflight-dcgm-diag
106+
tag: ""
107107
volumeMounts:
108108
- name: nvsentinel-socket
109109
mountPath: /var/run
@@ -112,9 +112,11 @@ initContainers:
112112
# limits:
113113
# memory: 512Mi
114114

115-
# # NCCL loopback test - validates intra-node GPU-to-GPU communication (NVLink/PCIe)
115+
# NCCL loopback test - validates intra-node GPU-to-GPU communication (NVLink/PCIe)
116116
- name: preflight-nccl-loopback
117-
image: ghcr.io/nvidia/nvsentinel/preflight-nccl-loopback:latest
117+
image:
118+
repository: ghcr.io/nvidia/nvsentinel/preflight-nccl-loopback
119+
tag: ""
118120
env:
119121
- name: BW_THRESHOLD_GBPS
120122
# Minimum acceptable bus bandwidth
@@ -135,7 +137,9 @@ initContainers:
135137
# NCCL all-reduce test - validates multi-node GPU communication
136138
# Requires gangCoordination.enabled=true and gang-aware scheduler (Volcano, Kueue, etc.)
137139
- name: preflight-nccl-allreduce
138-
image: ghcr.io/nvidia/nvsentinel/preflight-nccl-allreduce:latest
140+
image:
141+
repository: ghcr.io/nvidia/nvsentinel/preflight-nccl-allreduce
142+
tag: ""
139143
securityContext:
140144
capabilities:
141145
add: ["IPC_LOCK"] # Required for RDMA memory registration

distros/kubernetes/nvsentinel/values-tilt.yaml

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -121,7 +121,9 @@ preflight:
121121
diagLevel: 3
122122
initContainers:
123123
- name: preflight-dcgm-diag
124-
image: busybox:latest
124+
image:
125+
repository: busybox
126+
tag: latest
125127
command: ["/bin/sh", "-c"]
126128
args: ["echo 'preflight-dcgm-diag: no real GPU, exiting ok'; exit 0"]
127129

tests/helpers/preflight.go

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -34,7 +34,7 @@ const (
3434
PreflightNamespaceLabel = "nvsentinel.nvidia.com/preflight"
3535
PreflightNamespaceLabelVal = "enabled"
3636
PreflightDCGMDiagName = "preflight-dcgm-diag"
37-
PreflightConfigMapName = "nvsentinel-preflight-config"
37+
PreflightConfigMapName = "preflight"
3838
PreflightConfigKey = "config.yaml"
3939

4040
GangConfigMapLabelManagedBy = "nvsentinel.nvidia.com/managed-by"

tilt/Tiltfile

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -229,10 +229,10 @@ cert_manager_objects = [
229229
]
230230
if preflight_enabled:
231231
cert_manager_objects.extend([
232-
'nvsentinel-preflight-webhook-cert:certificate',
233-
'nvsentinel-preflight-ca:certificate',
234-
'nvsentinel-preflight-selfsigned:issuer',
235-
'nvsentinel-preflight-ca-issuer:issuer',
232+
'preflight-webhook-cert:certificate',
233+
'preflight-ca:certificate',
234+
'preflight-selfsigned:issuer',
235+
'preflight-ca-issuer:issuer',
236236
])
237237
if use_postgresql:
238238
cert_manager_objects.extend([

0 commit comments

Comments
 (0)