Skip to content

Commit 4038245

Browse files
authored
feature: update metadata-collector to track pod GPU device allocation (#663)
Signed-off-by: Nathan Herz <[email protected]>
1 parent d9b8056 commit 4038245

File tree

17 files changed

+2113
-56
lines changed

17 files changed

+2113
-56
lines changed
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
package model
16+
17+
const (
18+
// PodDeviceAnnotationName is the annotation key "dgxc.nvidia.com/devices".
// NOTE(review): presumably the pod annotation under which the pod-to-GPU
// device mapping is published; confirm against the code that writes it.
PodDeviceAnnotationName = "dgxc.nvidia.com/devices"
19+
)
20+
21+
var (
22+
// EntityTypeToResourceNames maps an entity type to the resource names
// associated with it. The only entry, "GPU_UUID", lists "nvidia.com/gpu"
// and "nvidia.com/pgpu".
// NOTE(review): presumably these are the Kubernetes resource names whose
// allocated device IDs are reported under that entity type; confirm
// against callers. The map is a mutable package-level variable, so callers
// should treat it as read-only.
EntityTypeToResourceNames = map[string][]string{
23+
"GPU_UUID": {
24+
"nvidia.com/gpu",
25+
"nvidia.com/pgpu",
26+
},
27+
}
28+
)
29+
30+
// DeviceAnnotation is a JSON-serializable container for a set of device lists.
// NOTE(review): presumably this is the payload stored in the
// PodDeviceAnnotationName pod annotation; confirm against the writer.
type DeviceAnnotation struct {
31+
// Devices maps a string key to a list of strings and is serialized under
// the JSON key "devices".
// NOTE(review): presumably keyed by entity type (e.g. "GPU_UUID") with the
// allocated device IDs as values; confirm against the writer.
Devices map[string][]string `json:"devices"`
32+
}
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
apiVersion: rbac.authorization.k8s.io/v1
16+
kind: ClusterRole
17+
metadata:
18+
name: {{ include "metadata-collector.fullname" . }}
19+
labels:
20+
{{- include "metadata-collector.labels" . | nindent 4 }}
21+
rules:
22+
# Grants "patch" on core-group pods.
# NOTE(review): presumably required so the collector can update the GPU
# device annotation on pod objects; confirm against the collector code.
- apiGroups:
23+
- ""
24+
resources:
25+
- pods
26+
verbs:
27+
- patch
28+
# Grants "get" on the nodes/proxy subresource.
# NOTE(review): presumably required to authorize calls to the Kubelet /pods
# HTTPS endpoint used to list pods on the node; confirm against the collector code.
- apiGroups:
29+
- ""
30+
resources:
31+
- nodes/proxy
32+
verbs:
33+
- get
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
apiVersion: rbac.authorization.k8s.io/v1
16+
kind: ClusterRoleBinding
17+
metadata:
18+
name: {{ include "metadata-collector.fullname" . }}
19+
labels:
20+
{{- include "metadata-collector.labels" . | nindent 4 }}
21+
roleRef:
22+
apiGroup: rbac.authorization.k8s.io
23+
kind: ClusterRole
24+
name: {{ include "metadata-collector.fullname" . }}
25+
subjects:
26+
- kind: ServiceAccount
27+
name: {{ include "metadata-collector.fullname" . }}
28+
namespace: {{ .Release.Namespace }}

distros/kubernetes/nvsentinel/charts/metadata-collector/templates/daemonset.yaml

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -35,13 +35,14 @@ spec:
3535
labels:
3636
{{- include "metadata-collector.selectorLabels" . | nindent 8 }}
3737
spec:
38+
serviceAccountName: {{ include "metadata-collector.fullname" . }}
3839
{{- with .Values.global.imagePullSecrets }}
3940
imagePullSecrets:
4041
{{- toYaml . | nindent 8 }}
4142
{{- end }}
4243
hostNetwork: true
4344
hostPID: true
44-
initContainers:
45+
containers:
4546
- name: metadata-collector
4647
securityContext:
4748
runAsUser: 0
@@ -65,10 +66,9 @@ spec:
6566
- name: sys
6667
mountPath: /sys
6768
readOnly: true
68-
containers:
69-
- name: pause
70-
image: "{{ .Values.pauseImage.repository }}:{{ .Values.pauseImage.tag }}"
71-
imagePullPolicy: {{ .Values.image.pullPolicy }}
69+
- mountPath: /var/lib/kubelet/pod-resources
70+
name: pod-gpu-resources
71+
readOnly: true
7272
volumes:
7373
- name: output
7474
hostPath:
@@ -78,6 +78,10 @@ spec:
7878
hostPath:
7979
path: /sys
8080
type: Directory
81+
- name: pod-gpu-resources
82+
hostPath:
83+
path: /var/lib/kubelet/pod-resources
84+
type: Directory
8185
nodeSelector:
8286
nvidia.com/gpu.present: "true"
8387
nvsentinel.dgxc.nvidia.com/driver.installed: "true"
@@ -94,4 +98,4 @@ spec:
9498
{{- end }}
9599
{{- if .Values.runtimeClassName }}
96100
runtimeClassName: {{ .Values.runtimeClassName }}
97-
{{- end }}
101+
{{- end }}
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
apiVersion: v1
16+
kind: ServiceAccount
17+
metadata:
18+
name: {{ include "metadata-collector.fullname" . }}
19+
labels:
20+
{{- include "metadata-collector.labels" . | nindent 4 }}

distros/kubernetes/nvsentinel/charts/metadata-collector/values.yaml

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,6 @@ image:
1717
pullPolicy: IfNotPresent
1818
tag: ""
1919

20-
pauseImage:
21-
repository: registry.k8s.io/pause
22-
tag: "3.10"
23-
2420
podAnnotations: {}
2521

2622
resources:

docs/configuration/metadata-collector.md

Lines changed: 1 addition & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
## Overview
44

5-
The Metadata Collector module collects GPU metadata using NVIDIA NVML (Management Library) and writes it to a shared file. Other modules read this file to enrich health events with GPU serial numbers, UUIDs, and topology information. This document covers all Helm configuration options for system administrators.
5+
The Metadata Collector module collects GPU metadata using NVIDIA NVML (Management Library) and writes it to a shared file. Other modules read this file to enrich health events with GPU serial numbers, UUIDs, and topology information. This component will also expose the pod-to-GPU mapping as an annotation on each pod requesting GPUs. This document covers all Helm configuration options for system administrators.
66

77
## Configuration Reference
88

@@ -50,22 +50,3 @@ Runtime class name that provides GPU device access. Required for NVML to query G
5050
- `nvidia` - NVIDIA container runtime (default)
5151
- `nvidia-legacy` - Legacy NVIDIA runtime
5252
- Empty string - Uses default cluster runtime. Used for CRIO environments
53-
54-
## Pause Image
55-
56-
The metadata collector uses an init container pattern. After metadata collection completes, a pause container keeps the pod running.
57-
58-
```yaml
59-
metadata-collector:
60-
pauseImage:
61-
repository: registry.k8s.io/pause
62-
tag: "3.10"
63-
```
64-
65-
### Parameters
66-
67-
#### pauseImage.repository
68-
Container image for the pause container.
69-
70-
#### pauseImage.tag
71-
Image tag for the pause container.

docs/metadata-collector.md

Lines changed: 19 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@ The Metadata Collector gathers comprehensive GPU and NVSwitch topology informati
66

77
Think of it as a hardware inventory scanner - it catalogs all GPUs, their connections, and NVSwitch fabric topology, making this information available for error analysis and troubleshooting.
88

9+
In addition to persisting the GPU and NVSwitch topology information from nodes in a local file, the Metadata Collector will also expose the pod-to-GPU mapping as an annotation on each pod requesting GPUs. This allows components running externally to the node to discover this device mapping through the Kubernetes API.
10+
911
### Why Do You Need This?
1012

1113
Health monitors need detailed hardware information to create accurate health events:
@@ -15,21 +17,29 @@ Health monitors need detailed hardware information to create accurate health eve
1517
- **Hardware identification**: Track GPU UUIDs, serial numbers, and device names
1618
- **NVSwitch mapping**: Identify which NVSwitches connect which GPUs
1719

18-
Without metadata collection, health monitors can only report generic errors without knowing which specific GPU or NVLink is affected.
20+
Without metadata collection, health monitors can only report generic errors without knowing which specific GPU or NVLink is affected. Additionally, the node drainer module needs the pod-to-GPU mapping to determine which set of pods is impacted by a given health event:
21+
22+
- **Partial drains**: For GPU faults requiring component resets, the node drainer module will reference this mapping to drain only the pods using that GPU
1923

2024
## How It Works
2125

22-
The Metadata Collector runs as an init container on each GPU node:
26+
GPU and NVSwitch topology information collection:
2327

2428
1. Initializes NVML (NVIDIA Management Library)
2529
2. Queries GPU information (UUID, PCI address, serial number, device name)
2630
3. Parses NVLink topology from nvidia-smi
2731
4. Builds NVSwitch fabric map
2832
5. Writes comprehensive metadata to JSON file
29-
6. Exits after collection completes
3033

3134
The JSON file persists on the node and is read by health monitors via a shared volume.
3235

36+
Pod-to-GPU mapping annotation:
37+
38+
1. To discover all pods running on the given node, this component will call the Kubelet /pods HTTPS endpoint.
39+
2. To discover the GPU devices allocated to each pod, this component will leverage the Kubelet PodResourcesLister gRPC service.
40+
3. If any pod has a change in its GPU device allocation, we will update the tracking annotation on the pod object.
41+
4. The Metadata Collector will run this logic in a loop at a fixed interval to continually update the mapping for new and existing pods.
42+
3343
## Configuration
3444

3545
Configure the Metadata Collector through Helm values:
@@ -40,10 +50,6 @@ metadata-collector:
4050

4151
# Runtime class for GPU access (omit for CRI-O environments)
4252
runtimeClassName: "nvidia"
43-
44-
pauseImage:
45-
repository: "registry.k8s.io/pause"
46-
tag: "3.10"
4753
```
4854
4955
### Configuration Options
@@ -62,6 +68,9 @@ The metadata collector gathers:
6268
- Device name/model
6369
- GPU index
6470

71+
### Pod Information
72+
- GPU UUIDs allocated to each pod
73+
6574
### NVLink Topology
6675
- NVLink connections between GPUs
6776
- Remote GPU endpoints for each link
@@ -86,11 +95,11 @@ Uses NVIDIA Management Library for reliable, direct hardware queries without ext
8695
### Topology Parsing
8796
Parses nvidia-smi output to build complete NVLink topology map showing GPU interconnections.
8897

89-
### Init Container Pattern
90-
Runs as init container, collects metadata once per pod lifecycle, then exits - minimal resource consumption.
91-
9298
### Shared Volume
9399
Writes metadata to shared volume accessible by health monitor sidecars for error correlation.
94100

95101
### JSON Output
96102
Structured JSON format for easy parsing and consumption by health monitors.
103+
104+
### Kubernetes API
105+
The pod-to-GPU mapping is exposed on pod objects as an annotation which can be consumed by external components.

metadata-collector/go.mod

Lines changed: 44 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,21 +9,61 @@ require (
99
github.com/nvidia/nvsentinel/commons v0.0.0
1010
github.com/nvidia/nvsentinel/data-models v0.0.0
1111
github.com/stretchr/testify v1.11.1
12+
google.golang.org/grpc v1.77.0
13+
k8s.io/api v0.35.0
14+
k8s.io/apimachinery v0.35.0
15+
k8s.io/client-go v0.35.0
16+
k8s.io/kubelet v0.35.0
1217
)
1318

1419
require (
1520
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
16-
github.com/kr/pretty v0.3.1 // indirect
21+
github.com/emicklei/go-restful/v3 v3.13.0 // indirect
22+
github.com/fxamacker/cbor/v2 v2.9.0 // indirect
23+
github.com/go-logr/logr v1.4.3 // indirect
24+
github.com/go-openapi/jsonpointer v0.22.3 // indirect
25+
github.com/go-openapi/jsonreference v0.21.3 // indirect
26+
github.com/go-openapi/swag v0.25.4 // indirect
27+
github.com/go-openapi/swag/cmdutils v0.25.4 // indirect
28+
github.com/go-openapi/swag/conv v0.25.4 // indirect
29+
github.com/go-openapi/swag/fileutils v0.25.4 // indirect
30+
github.com/go-openapi/swag/jsonname v0.25.4 // indirect
31+
github.com/go-openapi/swag/jsonutils v0.25.4 // indirect
32+
github.com/go-openapi/swag/loading v0.25.4 // indirect
33+
github.com/go-openapi/swag/mangling v0.25.4 // indirect
34+
github.com/go-openapi/swag/netutils v0.25.4 // indirect
35+
github.com/go-openapi/swag/stringutils v0.25.4 // indirect
36+
github.com/go-openapi/swag/typeutils v0.25.4 // indirect
37+
github.com/go-openapi/swag/yamlutils v0.25.4 // indirect
38+
github.com/google/gnostic-models v0.7.1 // indirect
39+
github.com/google/uuid v1.6.0 // indirect
40+
github.com/json-iterator/go v1.1.12 // indirect
41+
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
42+
github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee // indirect
43+
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
1744
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
18-
github.com/rogpeppe/go-internal v1.14.1 // indirect
45+
github.com/stretchr/objx v0.5.2 // indirect
46+
github.com/x448/float16 v0.8.4 // indirect
47+
go.yaml.in/yaml/v2 v2.4.3 // indirect
48+
go.yaml.in/yaml/v3 v3.0.4 // indirect
1949
golang.org/x/net v0.47.0 // indirect
50+
golang.org/x/oauth2 v0.33.0 // indirect
2051
golang.org/x/sys v0.38.0 // indirect
52+
golang.org/x/term v0.37.0 // indirect
2153
golang.org/x/text v0.31.0 // indirect
54+
golang.org/x/time v0.14.0 // indirect
2255
google.golang.org/genproto/googleapis/rpc v0.0.0-20251124214823-79d6a2a48846 // indirect
23-
google.golang.org/grpc v1.77.0 // indirect
2456
google.golang.org/protobuf v1.36.11 // indirect
25-
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c // indirect
57+
gopkg.in/evanphx/json-patch.v4 v4.13.0 // indirect
58+
gopkg.in/inf.v0 v0.9.1 // indirect
2659
gopkg.in/yaml.v3 v3.0.1 // indirect
60+
k8s.io/klog/v2 v2.130.1 // indirect
61+
k8s.io/kube-openapi v0.0.0-20251125145642-4e65d59e963e // indirect
62+
k8s.io/utils v0.0.0-20251002143259-bc988d571ff4 // indirect
63+
sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 // indirect
64+
sigs.k8s.io/randfill v1.0.0 // indirect
65+
sigs.k8s.io/structured-merge-diff/v6 v6.3.1 // indirect
66+
sigs.k8s.io/yaml v1.6.0 // indirect
2767
)
2868

2969
replace (

0 commit comments

Comments
 (0)