Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 32 additions & 0 deletions data-models/pkg/model/pod_device_annotation.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package model

const (
	// PodDeviceAnnotationName is the annotation key under which the
	// pod-to-device mapping is published on pod objects.
	PodDeviceAnnotationName = "dgxc.nvidia.com/devices"
)

var (
	// EntityTypeToResourceNames maps a device entity type identifier
	// (e.g. "GPU_UUID") to the Kubernetes extended resource names
	// associated with that entity type.
	EntityTypeToResourceNames = map[string][]string{
		"GPU_UUID": {
			"nvidia.com/gpu",
			"nvidia.com/pgpu",
		},
	}
)

// DeviceAnnotation is the JSON payload stored under the
// PodDeviceAnnotationName pod annotation.
type DeviceAnnotation struct {
	// Devices maps an entity type (e.g. "GPU_UUID") to the list of
	// device identifiers allocated to the pod.
	Devices map[string][]string `json:"devices"`
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# ClusterRole granting the metadata-collector the cluster-wide permissions
# it needs to publish the pod-to-GPU device mapping.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: {{ include "metadata-collector.fullname" . }}
  labels:
    {{- include "metadata-collector.labels" . | nindent 4 }}
rules:
# Needed to add/update the device-mapping annotation on pod objects.
- apiGroups:
  - ""
  resources:
  - pods
  verbs:
  - patch
# Needed to query the Kubelet API on each node (e.g. its /pods endpoint)
# to discover the pods running there.
- apiGroups:
  - ""
  resources:
  - nodes/proxy
  verbs:
  - get
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Binds the metadata-collector ClusterRole to the matching ServiceAccount
# so the workload pods receive the permissions defined in the role.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: {{ include "metadata-collector.fullname" . }}
  labels:
    {{- include "metadata-collector.labels" . | nindent 4 }}
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: {{ include "metadata-collector.fullname" . }}
subjects:
# The ServiceAccount of the same release-scoped name, in the release namespace.
- kind: ServiceAccount
  name: {{ include "metadata-collector.fullname" . }}
  namespace: {{ .Release.Namespace }}
Original file line number Diff line number Diff line change
Expand Up @@ -35,13 +35,14 @@ spec:
labels:
{{- include "metadata-collector.selectorLabels" . | nindent 8 }}
spec:
serviceAccountName: {{ include "metadata-collector.fullname" . }}
{{- with .Values.global.imagePullSecrets }}
imagePullSecrets:
{{- toYaml . | nindent 8 }}
{{- end }}
hostNetwork: true
hostPID: true
initContainers:
containers:
- name: metadata-collector
securityContext:
runAsUser: 0
Expand All @@ -65,10 +66,9 @@ spec:
- name: sys
mountPath: /sys
readOnly: true
containers:
- name: pause
image: "{{ .Values.pauseImage.repository }}:{{ .Values.pauseImage.tag }}"
imagePullPolicy: {{ .Values.image.pullPolicy }}
- mountPath: /var/lib/kubelet/pod-resources
name: pod-gpu-resources
readOnly: true
volumes:
- name: output
hostPath:
Expand All @@ -78,6 +78,10 @@ spec:
hostPath:
path: /sys
type: Directory
- name: pod-gpu-resources
hostPath:
path: /var/lib/kubelet/pod-resources
type: Directory
nodeSelector:
nvidia.com/gpu.present: "true"
nvsentinel.dgxc.nvidia.com/driver.installed: "true"
Expand All @@ -94,4 +98,4 @@ spec:
{{- end }}
{{- if .Values.runtimeClassName }}
runtimeClassName: {{ .Values.runtimeClassName }}
{{- end }}
{{- end }}
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# ServiceAccount used by the metadata-collector workload; RBAC permissions
# are granted to it via a ClusterRoleBinding of the same name.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: {{ include "metadata-collector.fullname" . }}
  labels:
    {{- include "metadata-collector.labels" . | nindent 4 }}
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,6 @@ image:
pullPolicy: IfNotPresent
tag: ""

pauseImage:
repository: registry.k8s.io/pause
tag: "3.10"

podAnnotations: {}

resources:
Expand Down
21 changes: 1 addition & 20 deletions docs/configuration/metadata-collector.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

## Overview

The Metadata Collector module collects GPU metadata using NVIDIA NVML (Management Library) and writes it to a shared file. Other modules read this file to enrich health events with GPU serial numbers, UUIDs, and topology information. This document covers all Helm configuration options for system administrators.
The Metadata Collector module collects GPU metadata using NVIDIA NVML (Management Library) and writes it to a shared file. Other modules read this file to enrich health events with GPU serial numbers, UUIDs, and topology information. This component will also expose the pod-to-GPU mapping as an annotation on each pod requesting GPUs. This document covers all Helm configuration options for system administrators.

## Configuration Reference

Expand Down Expand Up @@ -50,22 +50,3 @@ Runtime class name that provides GPU device access. Required for NVML to query G
- `nvidia` - NVIDIA container runtime (default)
- `nvidia-legacy` - Legacy NVIDIA runtime
- Empty string - Uses default cluster runtime. Used for CRIO environments

## Pause Image

The metadata collector uses an init container pattern. After metadata collection completes, a pause container keeps the pod running.

```yaml
metadata-collector:
pauseImage:
repository: registry.k8s.io/pause
tag: "3.10"
```

### Parameters

#### pauseImage.repository
Container image for the pause container.

#### pauseImage.tag
Image tag for the pause container.
29 changes: 19 additions & 10 deletions docs/metadata-collector.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ The Metadata Collector gathers comprehensive GPU and NVSwitch topology informati

Think of it as a hardware inventory scanner - it catalogs all GPUs, their connections, and NVSwitch fabric topology, making this information available for error analysis and troubleshooting.

In addition to persisting the GPU and NVSwitch topology information from nodes in a local file, the Metadata Collector will also expose the pod-to-GPU mapping as an annotation on each pod requesting GPUs. This allows components running externally to the node to discover this device mapping through the Kubernetes API.

### Why Do You Need This?

Health monitors need detailed hardware information to create accurate health events:
Expand All @@ -15,21 +17,29 @@ Health monitors need detailed hardware information to create accurate health eve
- **Hardware identification**: Track GPU UUIDs, serial numbers, and device names
- **NVSwitch mapping**: Identify which NVSwitches connect which GPUs

Without metadata collection, health monitors can only report generic errors without knowing which specific GPU or NVLink is affected.
Without metadata collection, health monitors can only report generic errors without knowing which specific GPU or NVLink is affected. Additionally, the node drainer module needs the pod-to-GPU mapping to determine which set of pods is impacted by a given health event:

- **Partial drains**: For GPU faults requiring component resets, the node drainer module will reference this mapping to only drain pods leveraging that GPU

## How It Works

The Metadata Collector runs as an init container on each GPU node:
GPU and NVSwitch topology information collection:

1. Initializes NVML (NVIDIA Management Library)
2. Queries GPU information (UUID, PCI address, serial number, device name)
3. Parses NVLink topology from nvidia-smi
4. Builds NVSwitch fabric map
5. Writes comprehensive metadata to JSON file
6. Exits after collection completes

The JSON file persists on the node and is read by health monitors via a shared volume.

GPU-to-pod mapping annotation:

1. To discover all pods running on the given node, this component will call the Kubelet /pods HTTPS endpoint.
2. To discover the GPU devices allocated to each pod, this component will leverage the Kubelet PodResourcesLister gRPC service.
3. If any pod has a change in its GPU device allocation, we will update the tracking annotation on the pod object.
4. The Metadata Collector will run this logic in a loop at a fixed interval to continually update the mapping for new and existing pods.

## Configuration

Configure the Metadata Collector through Helm values:
Expand All @@ -40,10 +50,6 @@ metadata-collector:

# Runtime class for GPU access (omit for CRI-O environments)
runtimeClassName: "nvidia"

pauseImage:
repository: "registry.k8s.io/pause"
tag: "3.10"
```
### Configuration Options
Expand All @@ -62,6 +68,9 @@ The metadata collector gathers:
- Device name/model
- GPU index

### Pod Information
- GPU UUIDs allocated to each pod

### NVLink Topology
- NVLink connections between GPUs
- Remote GPU endpoints for each link
Expand All @@ -86,11 +95,11 @@ Uses NVIDIA Management Library for reliable, direct hardware queries without ext
### Topology Parsing
Parses nvidia-smi output to build complete NVLink topology map showing GPU interconnections.

### Init Container Pattern
Runs as init container, collects metadata once per pod lifecycle, then exits - minimal resource consumption.

### Shared Volume
Writes metadata to shared volume accessible by health monitor sidecars for error correlation.

### JSON Output
Structured JSON format for easy parsing and consumption by health monitors.

### Kubernetes API
The pod-to-GPU mapping is exposed on pod objects as an annotation, which can be consumed by external components.
48 changes: 44 additions & 4 deletions metadata-collector/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -9,21 +9,61 @@ require (
github.com/nvidia/nvsentinel/commons v0.0.0
github.com/nvidia/nvsentinel/data-models v0.0.0
github.com/stretchr/testify v1.11.1
google.golang.org/grpc v1.77.0
k8s.io/api v0.35.0
k8s.io/apimachinery v0.35.0
k8s.io/client-go v0.35.0
k8s.io/kubelet v0.35.0
)

require (
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
github.com/kr/pretty v0.3.1 // indirect
github.com/emicklei/go-restful/v3 v3.13.0 // indirect
github.com/fxamacker/cbor/v2 v2.9.0 // indirect
github.com/go-logr/logr v1.4.3 // indirect
github.com/go-openapi/jsonpointer v0.22.3 // indirect
github.com/go-openapi/jsonreference v0.21.3 // indirect
github.com/go-openapi/swag v0.25.4 // indirect
github.com/go-openapi/swag/cmdutils v0.25.4 // indirect
github.com/go-openapi/swag/conv v0.25.4 // indirect
github.com/go-openapi/swag/fileutils v0.25.4 // indirect
github.com/go-openapi/swag/jsonname v0.25.4 // indirect
github.com/go-openapi/swag/jsonutils v0.25.4 // indirect
github.com/go-openapi/swag/loading v0.25.4 // indirect
github.com/go-openapi/swag/mangling v0.25.4 // indirect
github.com/go-openapi/swag/netutils v0.25.4 // indirect
github.com/go-openapi/swag/stringutils v0.25.4 // indirect
github.com/go-openapi/swag/typeutils v0.25.4 // indirect
github.com/go-openapi/swag/yamlutils v0.25.4 // indirect
github.com/google/gnostic-models v0.7.1 // indirect
github.com/google/uuid v1.6.0 // indirect
github.com/json-iterator/go v1.1.12 // indirect
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee // indirect
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
github.com/rogpeppe/go-internal v1.14.1 // indirect
github.com/stretchr/objx v0.5.2 // indirect
github.com/x448/float16 v0.8.4 // indirect
go.yaml.in/yaml/v2 v2.4.3 // indirect
go.yaml.in/yaml/v3 v3.0.4 // indirect
golang.org/x/net v0.47.0 // indirect
golang.org/x/oauth2 v0.33.0 // indirect
golang.org/x/sys v0.38.0 // indirect
golang.org/x/term v0.37.0 // indirect
golang.org/x/text v0.31.0 // indirect
golang.org/x/time v0.14.0 // indirect
google.golang.org/genproto/googleapis/rpc v0.0.0-20251124214823-79d6a2a48846 // indirect
google.golang.org/grpc v1.77.0 // indirect
google.golang.org/protobuf v1.36.11 // indirect
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c // indirect
gopkg.in/evanphx/json-patch.v4 v4.13.0 // indirect
gopkg.in/inf.v0 v0.9.1 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
k8s.io/klog/v2 v2.130.1 // indirect
k8s.io/kube-openapi v0.0.0-20251125145642-4e65d59e963e // indirect
k8s.io/utils v0.0.0-20251002143259-bc988d571ff4 // indirect
sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 // indirect
sigs.k8s.io/randfill v1.0.0 // indirect
sigs.k8s.io/structured-merge-diff/v6 v6.3.1 // indirect
sigs.k8s.io/yaml v1.6.0 // indirect
)

replace (
Expand Down
Loading
Loading