Skip to content

Commit 171432f

Browse files
committed
feature: update metadata-collector to track the GPUs allocated to each pod
Signed-off-by: Nathan Herz <[email protected]>
1 parent fd4466e commit 171432f

File tree

17 files changed

+2090
-55
lines changed

17 files changed

+2090
-55
lines changed
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
package model
16+
17+
const (
18+
// PodDeviceAnnotationName is the key of the pod annotation under which the
// devices allocated to a pod are published.
PodDeviceAnnotationName = "dgxc.nvidia.com/devices"
19+
)
20+
21+
var (
22+
// EntityTypeToResourceNames maps an entity type to the resource names
// associated with it. Only the "GPU_UUID" entity type is defined here.
// NOTE(review): presumably the values are the Kubernetes resource names a
// pod requests to be allocated a device of that entity type; confirm
// against the code that consumes this map.
EntityTypeToResourceNames = map[string][]string{
23+
"GPU_UUID": {
24+
"nvidia.com/gpu",
25+
"nvidia.com/pgpu",
26+
},
27+
}
28+
)
29+
30+
// DeviceAnnotation is a JSON-serializable structure with a single "devices"
// field.
// NOTE(review): presumably this is the value stored under the
// PodDeviceAnnotationName annotation; confirm against the writer.
type DeviceAnnotation struct {
31+
// Devices maps a string key to a list of strings.
// NOTE(review): presumably keyed by entity type (e.g. "GPU_UUID") with the
// allocated device identifiers as values; confirm against the writer.
Devices map[string][]string `json:"devices"`
32+
}
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
apiVersion: rbac.authorization.k8s.io/v1
16+
kind: ClusterRole
17+
metadata:
18+
name: {{ include "metadata-collector.fullname" . }}
19+
labels:
20+
{{- include "metadata-collector.labels" . | nindent 4 }}
21+
rules:
22+
- apiGroups:
23+
- ""
24+
resources:
25+
- pods
26+
verbs:
27+
- patch
28+
- apiGroups:
29+
- ""
30+
resources:
31+
- nodes/proxy
32+
verbs:
33+
- get
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
apiVersion: rbac.authorization.k8s.io/v1
16+
kind: ClusterRoleBinding
17+
metadata:
18+
name: {{ include "metadata-collector.fullname" . }}
19+
labels:
20+
{{- include "metadata-collector.labels" . | nindent 4 }}
21+
roleRef:
22+
apiGroup: rbac.authorization.k8s.io
23+
kind: ClusterRole
24+
name: {{ include "metadata-collector.fullname" . }}
25+
subjects:
26+
- kind: ServiceAccount
27+
name: {{ include "metadata-collector.fullname" . }}
28+
namespace: {{ .Release.Namespace }}

distros/kubernetes/nvsentinel/charts/metadata-collector/templates/daemonset.yaml

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -35,13 +35,14 @@ spec:
3535
labels:
3636
{{- include "metadata-collector.selectorLabels" . | nindent 8 }}
3737
spec:
38+
serviceAccountName: {{ include "metadata-collector.fullname" . }}
3839
{{- with .Values.global.imagePullSecrets }}
3940
imagePullSecrets:
4041
{{- toYaml . | nindent 8 }}
4142
{{- end }}
4243
hostNetwork: true
4344
hostPID: true
44-
initContainers:
45+
containers:
4546
- name: metadata-collector
4647
securityContext:
4748
runAsUser: 0
@@ -65,10 +66,9 @@ spec:
6566
- name: sys
6667
mountPath: /sys
6768
readOnly: true
68-
containers:
69-
- name: pause
70-
image: "{{ .Values.pauseImage.repository }}:{{ .Values.pauseImage.tag }}"
71-
imagePullPolicy: {{ .Values.image.pullPolicy }}
69+
- mountPath: /var/lib/kubelet/pod-resources
70+
name: pod-gpu-resources
71+
readOnly: true
7272
volumes:
7373
- name: output
7474
hostPath:
@@ -78,6 +78,10 @@ spec:
7878
hostPath:
7979
path: /sys
8080
type: Directory
81+
- name: pod-gpu-resources
82+
hostPath:
83+
path: /var/lib/kubelet/pod-resources
84+
type: Directory
8185
nodeSelector:
8286
nvidia.com/gpu.present: "true"
8387
nvsentinel.dgxc.nvidia.com/driver.installed: "true"
@@ -94,4 +98,4 @@ spec:
9498
{{- end }}
9599
{{- if .Values.runtimeClassName }}
96100
runtimeClassName: {{ .Values.runtimeClassName }}
97-
{{- end }}
101+
{{- end }}
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
apiVersion: v1
16+
kind: ServiceAccount
17+
metadata:
18+
name: {{ include "metadata-collector.fullname" . }}
19+
labels:
20+
{{- include "metadata-collector.labels" . | nindent 4 }}

distros/kubernetes/nvsentinel/charts/metadata-collector/values.yaml

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,6 @@ image:
1717
pullPolicy: IfNotPresent
1818
tag: ""
1919

20-
pauseImage:
21-
repository: registry.k8s.io/pause
22-
tag: "3.10"
23-
2420
podAnnotations: {}
2521

2622
resources:

docs/configuration/metadata-collector.md

Lines changed: 1 addition & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
## Overview
44

5-
The Metadata Collector module collects GPU metadata using NVIDIA NVML (Management Library) and writes it to a shared file. Other modules read this file to enrich health events with GPU serial numbers, UUIDs, and topology information. This document covers all Helm configuration options for system administrators.
5+
The Metadata Collector module collects GPU metadata using NVIDIA NVML (Management Library) and writes it to a shared file. Other modules read this file to enrich health events with GPU serial numbers, UUIDs, and topology information. This component also exposes the pod-to-GPU mapping as an annotation on each pod that requests GPUs. This document covers all Helm configuration options for system administrators.
66

77
## Configuration Reference
88

@@ -50,22 +50,3 @@ Runtime class name that provides GPU device access. Required for NVML to query G
5050
- `nvidia` - NVIDIA container runtime (default)
5151
- `nvidia-legacy` - Legacy NVIDIA runtime
5252
- Empty string - Uses default cluster runtime. Used for CRIO environments
53-
54-
## Pause Image
55-
56-
The metadata collector uses an init container pattern. After metadata collection completes, a pause container keeps the pod running.
57-
58-
```yaml
59-
metadata-collector:
60-
pauseImage:
61-
repository: registry.k8s.io/pause
62-
tag: "3.10"
63-
```
64-
65-
### Parameters
66-
67-
#### pauseImage.repository
68-
Container image for the pause container.
69-
70-
#### pauseImage.tag
71-
Image tag for the pause container.

docs/metadata-collector.md

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@ The Metadata Collector gathers comprehensive GPU and NVSwitch topology informati
66

77
Think of it as a hardware inventory scanner - it catalogs all GPUs, their connections, and NVSwitch fabric topology, making this information available for error analysis and troubleshooting.
88

9+
In addition to persisting the GPU and NVSwitch topology information from nodes in a local file, the Metadata Collector will also expose the pod to GPU mapping as an annotation on each pod requesting GPUs. This allows components running externally to the node to discover this device mapping through the Kubernetes API.
10+
911
### Why Do You Need This?
1012

1113
Health monitors need detailed hardware information to create accurate health events:
@@ -17,16 +19,20 @@ Health monitors need detailed hardware information to create accurate health eve
1719

1820
Without metadata collection, health monitors can only report generic errors without knowing which specific GPU or NVLink is affected.
1921

22+
Additionally, the node drainer module needs the pod to GPU mapping to map a set of pods to the impacted entities in health events:
23+
24+
- **Partial drains**: For GPU faults requiring component resets, the node drainer module uses this mapping to drain only the pods that are using the affected GPU
25+
2026
## How It Works
2127

22-
The Metadata Collector runs as an init container on each GPU node:
28+
The Metadata Collector runs as a regular container on each GPU node:
2329

2430
1. Initializes NVML (NVIDIA Management Library)
2531
2. Queries GPU information (UUID, PCI address, serial number, device name)
2632
3. Parses NVLink topology from nvidia-smi
2733
4. Builds NVSwitch fabric map
2834
5. Writes comprehensive metadata to JSON file
29-
6. Exits after collection completes
35+
6. Continually monitors the GPUs allocated to each pod on the given node and exposes this information as a pod annotation named `dgxc.nvidia.com/devices`
3036

3137
The JSON file persists on the node and is read by health monitors via a shared volume.
3238

@@ -40,10 +46,6 @@ metadata-collector:
4046

4147
# Runtime class for GPU access (omit for CRI-O environments)
4248
runtimeClassName: "nvidia"
43-
44-
pauseImage:
45-
repository: "registry.k8s.io/pause"
46-
tag: "3.10"
4749
```
4850
4951
### Configuration Options
@@ -62,6 +64,9 @@ The metadata collector gathers:
6264
- Device name/model
6365
- GPU index
6466

67+
### Pod Information
68+
- GPU UUIDs allocated to each pod
69+
6570
### NVLink Topology
6671
- NVLink connections between GPUs
6772
- Remote GPU endpoints for each link
@@ -86,11 +91,11 @@ Uses NVIDIA Management Library for reliable, direct hardware queries without ext
8691
### Topology Parsing
8792
Parses nvidia-smi output to build complete NVLink topology map showing GPU interconnections.
8893

89-
### Init Container Pattern
90-
Runs as init container, collects metadata once per pod lifecycle, then exits - minimal resource consumption.
91-
9294
### Shared Volume
9395
Writes metadata to shared volume accessible by health monitor sidecars for error correlation.
9496

9597
### JSON Output
9698
Structured JSON format for easy parsing and consumption by health monitors.
99+
100+
### Kubernetes API
101+
The pod-to-GPU mapping is exposed on pod objects as an annotation, which can be consumed by external components.

metadata-collector/go.mod

Lines changed: 44 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,21 +9,61 @@ require (
99
github.com/nvidia/nvsentinel/commons v0.0.0
1010
github.com/nvidia/nvsentinel/data-models v0.0.0
1111
github.com/stretchr/testify v1.11.1
12+
google.golang.org/grpc v1.77.0
13+
k8s.io/api v0.35.0
14+
k8s.io/apimachinery v0.35.0
15+
k8s.io/client-go v0.35.0
16+
k8s.io/kubelet v0.35.0
1217
)
1318

1419
require (
1520
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
16-
github.com/kr/pretty v0.3.1 // indirect
21+
github.com/emicklei/go-restful/v3 v3.13.0 // indirect
22+
github.com/fxamacker/cbor/v2 v2.9.0 // indirect
23+
github.com/go-logr/logr v1.4.3 // indirect
24+
github.com/go-openapi/jsonpointer v0.22.3 // indirect
25+
github.com/go-openapi/jsonreference v0.21.3 // indirect
26+
github.com/go-openapi/swag v0.25.4 // indirect
27+
github.com/go-openapi/swag/cmdutils v0.25.4 // indirect
28+
github.com/go-openapi/swag/conv v0.25.4 // indirect
29+
github.com/go-openapi/swag/fileutils v0.25.4 // indirect
30+
github.com/go-openapi/swag/jsonname v0.25.4 // indirect
31+
github.com/go-openapi/swag/jsonutils v0.25.4 // indirect
32+
github.com/go-openapi/swag/loading v0.25.4 // indirect
33+
github.com/go-openapi/swag/mangling v0.25.4 // indirect
34+
github.com/go-openapi/swag/netutils v0.25.4 // indirect
35+
github.com/go-openapi/swag/stringutils v0.25.4 // indirect
36+
github.com/go-openapi/swag/typeutils v0.25.4 // indirect
37+
github.com/go-openapi/swag/yamlutils v0.25.4 // indirect
38+
github.com/google/gnostic-models v0.7.1 // indirect
39+
github.com/google/uuid v1.6.0 // indirect
40+
github.com/json-iterator/go v1.1.12 // indirect
41+
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
42+
github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee // indirect
43+
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
1744
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
18-
github.com/rogpeppe/go-internal v1.14.1 // indirect
45+
github.com/stretchr/objx v0.5.2 // indirect
46+
github.com/x448/float16 v0.8.4 // indirect
47+
go.yaml.in/yaml/v2 v2.4.3 // indirect
48+
go.yaml.in/yaml/v3 v3.0.4 // indirect
1949
golang.org/x/net v0.47.0 // indirect
50+
golang.org/x/oauth2 v0.33.0 // indirect
2051
golang.org/x/sys v0.38.0 // indirect
52+
golang.org/x/term v0.37.0 // indirect
2153
golang.org/x/text v0.31.0 // indirect
54+
golang.org/x/time v0.14.0 // indirect
2255
google.golang.org/genproto/googleapis/rpc v0.0.0-20251124214823-79d6a2a48846 // indirect
23-
google.golang.org/grpc v1.77.0 // indirect
2456
google.golang.org/protobuf v1.36.11 // indirect
25-
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c // indirect
57+
gopkg.in/evanphx/json-patch.v4 v4.13.0 // indirect
58+
gopkg.in/inf.v0 v0.9.1 // indirect
2659
gopkg.in/yaml.v3 v3.0.1 // indirect
60+
k8s.io/klog/v2 v2.130.1 // indirect
61+
k8s.io/kube-openapi v0.0.0-20251125145642-4e65d59e963e // indirect
62+
k8s.io/utils v0.0.0-20251002143259-bc988d571ff4 // indirect
63+
sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 // indirect
64+
sigs.k8s.io/randfill v1.0.0 // indirect
65+
sigs.k8s.io/structured-merge-diff/v6 v6.3.1 // indirect
66+
sigs.k8s.io/yaml v1.6.0 // indirect
2767
)
2868

2969
replace (

0 commit comments

Comments
 (0)