Skip to content

Commit 887f799

Browse files
authored
Merge branch 'main' into xrfxlp/1402/feat
2 parents 361ab43 + 28ea6ed commit 887f799

20 files changed

Lines changed: 1695 additions & 65 deletions

File tree

distros/kubernetes/nvsentinel/charts/labeler/templates/_helpers.tpl

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,3 +38,45 @@ Selector labels
3838
app.kubernetes.io/name: {{ include "labeler.name" . }}
3939
app.kubernetes.io/instance: {{ .Release.Name }}
4040
{{- end }}
41+
42+
{{/*
43+
ConfigMap name.
44+
*/}}
45+
{{- define "labeler.configMapName" -}}
46+
{{ include "labeler.fullname" . }}
47+
{{- end }}
48+
49+
{{/*
50+
Expected device-count configuration content.
51+
*/}}
52+
{{- define "labeler.expectedDeviceCountsConfig" -}}
53+
enabled = {{ .Values.expectedDeviceCounts.enabled }}
54+
{{- range .Values.expectedDeviceCounts.classes }}
55+
56+
[[classes]]
57+
name = {{ .name | quote }}
58+
enabled = {{ .enabled }}
59+
{{- with .groupingLabels }}
60+
groupingLabels = [{{- range $i, $label := . }}{{ if $i }}, {{ end }}{{ $label | quote }}{{- end }}]
61+
{{- end }}
62+
currentExpression = '''
63+
{{ trimSuffix "\n" (default "" .currentExpression) }}
64+
'''
65+
66+
[classes.labels]
67+
current = {{ .labels.current | quote }}
68+
expected = {{ .labels.expected | quote }}
69+
{{- range .expectedCountOverrides }}
70+
71+
[[classes.expectedCountOverrides]]
72+
count = {{ .count }}
73+
{{- with .matchLabels }}
74+
75+
[classes.expectedCountOverrides.matchLabels]
76+
{{- range $key, $value := . }}
77+
{{ $key | quote }} = {{ $value | quote }}
78+
{{- end }}
79+
{{- end }}
80+
{{- end }}
81+
{{- end }}
82+
{{- end }}

distros/kubernetes/nvsentinel/charts/labeler/templates/clusterrole.yaml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,3 +37,11 @@ rules:
3737
- watch
3838
- patch
3939
- update
40+
- apiGroups:
41+
- resource.k8s.io
42+
resources:
43+
- resourceslices
44+
verbs:
45+
- get
46+
- list
47+
- watch
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
{{- if .Values.expectedDeviceCounts.enabled }}
16+
apiVersion: v1
17+
kind: ConfigMap
18+
metadata:
19+
name: {{ include "labeler.configMapName" . }}
20+
labels:
21+
{{- include "labeler.labels" . | nindent 4 }}
22+
data:
23+
expected-device-counts.toml: |
24+
{{ include "labeler.expectedDeviceCountsConfig" . | indent 4 }}
25+
{{- end }}

distros/kubernetes/nvsentinel/charts/labeler/templates/deployment.yaml

Lines changed: 27 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27,10 +27,15 @@ spec:
2727
{{- include "labeler.selectorLabels" . | nindent 6 }}
2828
template:
2929
metadata:
30-
{{- with .Values.podAnnotations }}
30+
{{- if or .Values.podAnnotations .Values.expectedDeviceCounts.enabled }}
3131
annotations:
32+
{{- if .Values.expectedDeviceCounts.enabled }}
33+
checksum/expected-device-counts-config: {{ include "labeler.expectedDeviceCountsConfig" . | sha256sum }}
34+
{{- end }}
35+
{{- with .Values.podAnnotations }}
3236
{{- toYaml . | nindent 8 }}
3337
{{- end }}
38+
{{- end }}
3439
labels:
3540
{{- include "labeler.labels" . | nindent 8 }}
3641
{{- with .Values.podLabels }}
@@ -64,6 +69,10 @@ spec:
6469
{{- if .Values.assumeDriverInstalled }}
6570
- "--assume-driver-installed"
6671
{{- end }}
72+
{{- if .Values.expectedDeviceCounts.enabled }}
73+
- "--expected-device-counts-config-file"
74+
- "/etc/nvsentinel/labeler/expected-device-counts.toml"
75+
{{- end }}
6776
resources:
6877
{{- toYaml .Values.resources | nindent 12 }}
6978
ports:
@@ -85,9 +94,17 @@ spec:
8594
periodSeconds: 10
8695
timeoutSeconds: 3
8796
failureThreshold: 3
88-
{{- if .Values.global.auditLogging.enabled }}
97+
{{- if or .Values.global.auditLogging.enabled .Values.expectedDeviceCounts.enabled }}
8998
volumeMounts:
99+
{{- if .Values.expectedDeviceCounts.enabled }}
100+
- name: expected-device-counts-config
101+
mountPath: /etc/nvsentinel/labeler/expected-device-counts.toml
102+
subPath: expected-device-counts.toml
103+
readOnly: true
104+
{{- end }}
105+
{{- if .Values.global.auditLogging.enabled }}
90106
{{- include "nvsentinel.auditLogging.volumeMount" . | nindent 12 }}
107+
{{- end }}
91108
{{- end }}
92109
env:
93110
- name: POD_NAME
@@ -99,9 +116,16 @@ spec:
99116
{{- if .Values.global.auditLogging.enabled }}
100117
{{- include "nvsentinel.auditLogging.envVars" . | nindent 12 }}
101118
{{- end }}
102-
{{- if .Values.global.auditLogging.enabled }}
119+
{{- if or .Values.global.auditLogging.enabled .Values.expectedDeviceCounts.enabled }}
103120
volumes:
121+
{{- if .Values.expectedDeviceCounts.enabled }}
122+
- name: expected-device-counts-config
123+
configMap:
124+
name: {{ include "labeler.configMapName" . }}
125+
{{- end }}
126+
{{- if .Values.global.auditLogging.enabled }}
104127
{{- include "nvsentinel.auditLogging.volume" . | nindent 8 }}
128+
{{- end }}
105129
{{- end }}
106130
{{- with (.Values.global.systemNodeSelector | default .Values.nodeSelector) }}
107131
nodeSelector:

distros/kubernetes/nvsentinel/charts/labeler/values.yaml

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,60 @@ kataLabelOverride: ""
4242
# are installed directly on the host rather than via GPU Operator driver containers.
4343
assumeDriverInstalled: false
4444

45+
# Expected device count labels
46+
# When enabled, the labeler writes current/expected device-count labels per configured class.
47+
# Current counts come from CEL expressions over the node object and associated ResourceSlices.
48+
# Expected counts are learned per grouping-label partition unless an override matches.
49+
expectedDeviceCounts:
50+
enabled: true
51+
classes:
52+
- name: gpu
53+
enabled: true
54+
labels:
55+
current: nvsentinel.dgxc.nvidia.com/gpu.count.current
56+
expected: nvsentinel.dgxc.nvidia.com/gpu.count.expected
57+
groupingLabels:
58+
- node.kubernetes.io/instance-type
59+
- nvidia.com/gpu.product
60+
expectedCountOverrides: []
61+
currentExpression: |
62+
int(node.metadata.labels['nvidia.com/gpu.count'])
63+
- name: nic
64+
enabled: true
65+
labels:
66+
current: nvsentinel.dgxc.nvidia.com/nic.count.current
67+
expected: nvsentinel.dgxc.nvidia.com/nic.count.expected
68+
groupingLabels:
69+
- node.kubernetes.io/instance-type
70+
expectedCountOverrides: []
71+
currentExpression: |
72+
int(node.status.allocatable['nvidia.com/mlnxnics'])
73+
# Example AWS RoCE DRA NIC class. It reuses the `nic` name and labels, so use it in place of the `nic` class above rather than alongside it; adapt per environment.
74+
# - name: nic
75+
# enabled: false
76+
# labels:
77+
# current: nvsentinel.dgxc.nvidia.com/nic.count.current
78+
# expected: nvsentinel.dgxc.nvidia.com/nic.count.expected
79+
# groupingLabels:
80+
# - node.kubernetes.io/instance-type
81+
# expectedCountOverrides: []
82+
# currentExpression: |
83+
# sum(resourceSlices
84+
# .filter(rs,
85+
# has(rs.spec.driver) &&
86+
# rs.spec.driver == 'dra.networking.k8s.aws' &&
87+
# has(rs.spec.devices)
88+
# )
89+
# .map(rs, rs.spec.devices
90+
# .filter(d,
91+
# has(d.attributes) &&
92+
# 'dra.vpc.amazonaws.com/deviceType' in d.attributes &&
93+
# has(d.attributes['dra.vpc.amazonaws.com/deviceType'].string) &&
94+
# d.attributes['dra.vpc.amazonaws.com/deviceType'].string == 'roce'
95+
# )
96+
# .size()
97+
# ))
98+
4599
resources:
46100
requests:
47101
cpu: 100m

distros/kubernetes/nvsentinel/values-full.yaml

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -954,6 +954,63 @@ labeler:
954954
# Leave empty to only check default label
955955
kataLabelOverride: ""
956956

957+
# Pre-installed driver configuration
958+
# When enabled, the labeler sets nvsentinel.dgxc.nvidia.com/driver.installed=true
959+
# on all nodes with nvidia.com/gpu.present=true, skipping driver pod detection.
960+
assumeDriverInstalled: false
961+
962+
# Expected device count labels
963+
# Writes current/expected device-count labels computed by CEL expressions over the node object or DRA ResourceSlice data.
964+
expectedDeviceCounts:
965+
enabled: true
966+
classes:
967+
- name: gpu
968+
enabled: true
969+
labels:
970+
current: nvsentinel.dgxc.nvidia.com/gpu.count.current
971+
expected: nvsentinel.dgxc.nvidia.com/gpu.count.expected
972+
groupingLabels:
973+
- node.kubernetes.io/instance-type
974+
- nvidia.com/gpu.product
975+
expectedCountOverrides: []
976+
currentExpression: |
977+
int(node.metadata.labels['nvidia.com/gpu.count'])
978+
- name: nic
979+
enabled: true
980+
labels:
981+
current: nvsentinel.dgxc.nvidia.com/nic.count.current
982+
expected: nvsentinel.dgxc.nvidia.com/nic.count.expected
983+
groupingLabels:
984+
- node.kubernetes.io/instance-type
985+
expectedCountOverrides: []
986+
currentExpression: |
987+
int(node.status.allocatable['nvidia.com/mlnxnics'])
988+
# Example AWS RoCE DRA NIC class. It reuses the `nic` name and labels, so use it in place of the `nic` class above rather than alongside it; adapt per environment.
989+
# - name: nic
990+
# enabled: false
991+
# labels:
992+
# current: nvsentinel.dgxc.nvidia.com/nic.count.current
993+
# expected: nvsentinel.dgxc.nvidia.com/nic.count.expected
994+
# groupingLabels:
995+
# - node.kubernetes.io/instance-type
996+
# expectedCountOverrides: []
997+
# currentExpression: |
998+
# sum(resourceSlices
999+
# .filter(rs,
1000+
# has(rs.spec.driver) &&
1001+
# rs.spec.driver == 'dra.networking.k8s.aws' &&
1002+
# has(rs.spec.devices)
1003+
# )
1004+
# .map(rs, rs.spec.devices
1005+
# .filter(d,
1006+
# has(d.attributes) &&
1007+
# 'dra.vpc.amazonaws.com/deviceType' in d.attributes &&
1008+
# has(d.attributes['dra.vpc.amazonaws.com/deviceType'].string) &&
1009+
# d.attributes['dra.vpc.amazonaws.com/deviceType'].string == 'roce'
1010+
# )
1011+
# .size()
1012+
# ))
1013+
9571014
# Pod resource limits and requests
9581015
resources:
9591016
requests:

docs/configuration/labeler.md

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,10 @@ The labeler automatically manages these node labels:
1313
| `nvsentinel.dgxc.nvidia.com/dcgm.version` | `3.x`, `4.x` | DCGM major version detected from DCGM pods |
1414
| `nvsentinel.dgxc.nvidia.com/driver.installed` | `true`, `false` | NVIDIA driver pod status on node |
1515
| `nvsentinel.dgxc.nvidia.com/kata.enabled` | `true`, `false` | Kata Containers runtime presence |
16+
| `nvsentinel.dgxc.nvidia.com/gpu.count.current` | non-negative integer | Current GPU count from the configured class expression |
17+
| `nvsentinel.dgxc.nvidia.com/gpu.count.expected` | non-negative integer | Expected GPU count from override or learned hardware-class baseline |
18+
| `nvsentinel.dgxc.nvidia.com/nic.count.current` | non-negative integer | Current NIC count from the configured class expression |
19+
| `nvsentinel.dgxc.nvidia.com/nic.count.expected` | non-negative integer | Expected NIC count from override or learned hardware-class baseline |
1620

1721
## Configuration Reference
1822

@@ -79,6 +83,41 @@ The following label values (case-insensitive) are considered truthy for Kata det
7983

8084
Any other value or missing label results in `kata.enabled=false`.
8185

86+
## Expected Device Counts
87+
88+
Expected device-count labeling is enabled by default in the chart values (`expectedDeviceCounts.enabled: true`); set it to `false` to turn the feature off. When enabled, the labeler evaluates each enabled class and writes the current/expected count labels only when the configured CEL expression returns a valid non-negative integer.
89+
90+
The Helm chart renders this values block into a TOML ConfigMap entry and mounts it into the labeler pod. Because expressions are compiled at startup, Helm also annotates the pod template with a checksum so changes to the ConfigMap roll the Deployment.
91+
92+
```yaml
93+
labeler:
94+
expectedDeviceCounts:
95+
enabled: true
96+
classes:
97+
- name: gpu
98+
enabled: true
99+
labels:
100+
current: nvsentinel.dgxc.nvidia.com/gpu.count.current
101+
expected: nvsentinel.dgxc.nvidia.com/gpu.count.expected
102+
groupingLabels:
103+
- node.kubernetes.io/instance-type
104+
- nvidia.com/gpu.product
105+
expectedCountOverrides:
106+
- matchLabels:
107+
nvidia.com/gpu.product: NVIDIA-GB200
108+
count: 8
109+
currentExpression: |
110+
int(node.metadata.labels['nvidia.com/gpu.count'])
111+
```
112+
113+
The CEL context exposes:
114+
115+
- `node`: the Kubernetes Node object being reconciled.
116+
- `resourceSlices`: ResourceSlice objects associated with the node.
117+
- `sum(list<int>)`: helper that returns the sum of a list of integers.
118+
119+
For classes without a matching override, the expected value is learned as the maximum of the current counts and any existing expected counts across nodes that share the same configured grouping-label values. Learned expected counts can rise automatically, but they do not fall automatically when a node reports fewer devices.
120+
82121
### Kata Detection Examples
83122

84123
#### Example 1: Default Detection

docs/labeler.md

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,9 @@ The Labeler runs as a deployment in the cluster:
3131
2. When pods start on a node, examines container images to extract versions
3232
3. Updates node labels with detected versions
3333
4. Watches node labels to detect Kata Container runtime
34-
5. NVSentinel components read these labels and configure themselves accordingly
35-
6. Continuously keeps labels synchronized as infrastructure changes
34+
5. Optionally evaluates configured device-count classes and labels current/expected GPU or NIC counts
35+
6. NVSentinel components read these labels and configure themselves accordingly
36+
7. Continuously keeps labels synchronized as infrastructure changes
3637

3738
For example:
3839
- GPU Health Monitor uses the DCGM version label to select the correct DCGM API version
@@ -51,12 +52,17 @@ labeler:
5152

5253
# Optional: Override the default Kata Containers detection label
5354
kataLabelOverride: "" # Custom label to check for Kata runtime
55+
56+
# Optional: Enable or disable current/expected device-count labels
57+
expectedDeviceCounts:
58+
enabled: false
5459
```
5560
5661
### Configuration Options
5762
5863
- **Log Level**: Control logging verbosity (info, debug, warn, error)
5964
- **Kata Label Override**: Specify additional node label to check for Kata Container detection
65+
- **Expected Device Counts**: Configure device-count classes that derive current and expected count labels from CEL expressions over the node object or DRA ResourceSlices
6066
6167
## Labels Applied
6268
@@ -90,6 +96,17 @@ kubectl label nodes <node-name> nvsentinel.dgxc.nvidia.com/driver.installed=true
9096

9197
Indicates whether the node is running Kata Containers runtime (detected from node labels).
9298

99+
### Expected Device Counts
100+
**Labels**:
101+
- `nvsentinel.dgxc.nvidia.com/gpu.count.current`
102+
- `nvsentinel.dgxc.nvidia.com/gpu.count.expected`
103+
- `nvsentinel.dgxc.nvidia.com/nic.count.current`
104+
- `nvsentinel.dgxc.nvidia.com/nic.count.expected`
105+
106+
**Values**: non-negative integer strings
107+
108+
When enabled, the labeler evaluates configured CEL expressions against the node and associated DRA ResourceSlices. Current labels reflect the observed count. Expected labels come from an override or the maximum learned count among nodes in the same grouping-label partition.
109+
93110
## Key Features
94111

95112
### Self-Configuration

0 commit comments

Comments
 (0)