Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 32 additions & 0 deletions data-models/pkg/model/pod_device_annotation.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package model

const (
	// PodDeviceAnnotationName is the annotation key under which the
	// pod-to-device mapping is published on pod objects.
	PodDeviceAnnotationName = "dgxc.nvidia.com/devices"
)

var (
	// EntityTypeToResourceNames maps a device entity type identifier
	// (e.g. "GPU_UUID") to the Kubernetes extended resource names
	// associated with that entity type.
	EntityTypeToResourceNames = map[string][]string{
		"GPU_UUID": {
			"nvidia.com/gpu",
			"nvidia.com/pgpu",
		},
	}
)

// DeviceAnnotation is the JSON payload stored under the
// PodDeviceAnnotationName pod annotation.
type DeviceAnnotation struct {
	// Devices maps an entity type (e.g. "GPU_UUID") to the list of
	// device identifiers allocated to the pod.
	Devices map[string][]string `json:"devices"`
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# ClusterRole granting the metadata-collector the cluster-wide permissions
# it needs to publish the pod-to-GPU device mapping.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: {{ include "metadata-collector.fullname" . }}
  labels:
    {{- include "metadata-collector.labels" . | nindent 4 }}
rules:
# Needed to add/update the device-mapping annotation on pod objects.
- apiGroups:
  - ""
  resources:
  - pods
  verbs:
  - patch
# Needed to query the Kubelet API on each node (e.g. its /pods endpoint)
# to discover the pods running there.
- apiGroups:
  - ""
  resources:
  - nodes/proxy
  verbs:
  - get
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Binds the metadata-collector ClusterRole to the matching ServiceAccount
# so the workload pods receive the permissions defined in the role.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: {{ include "metadata-collector.fullname" . }}
  labels:
    {{- include "metadata-collector.labels" . | nindent 4 }}
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: {{ include "metadata-collector.fullname" . }}
subjects:
# The ServiceAccount of the same release-scoped name, in the release namespace.
- kind: ServiceAccount
  name: {{ include "metadata-collector.fullname" . }}
  namespace: {{ .Release.Namespace }}
Original file line number Diff line number Diff line change
Expand Up @@ -35,13 +35,14 @@ spec:
labels:
{{- include "metadata-collector.selectorLabels" . | nindent 8 }}
spec:
serviceAccountName: {{ include "metadata-collector.fullname" . }}
{{- with .Values.global.imagePullSecrets }}
imagePullSecrets:
{{- toYaml . | nindent 8 }}
{{- end }}
hostNetwork: true
hostPID: true
initContainers:
containers:
- name: metadata-collector
securityContext:
runAsUser: 0
Expand All @@ -65,10 +66,9 @@ spec:
- name: sys
mountPath: /sys
readOnly: true
containers:
- name: pause
image: "{{ .Values.pauseImage.repository }}:{{ .Values.pauseImage.tag }}"
imagePullPolicy: {{ .Values.image.pullPolicy }}
- mountPath: /var/lib/kubelet/pod-resources
name: pod-gpu-resources
readOnly: true
volumes:
- name: output
hostPath:
Expand All @@ -78,6 +78,10 @@ spec:
hostPath:
path: /sys
type: Directory
- name: pod-gpu-resources
hostPath:
path: /var/lib/kubelet/pod-resources
type: Directory
nodeSelector:
nvidia.com/gpu.present: "true"
nvsentinel.dgxc.nvidia.com/driver.installed: "true"
Expand All @@ -94,4 +98,4 @@ spec:
{{- end }}
{{- if .Values.runtimeClassName }}
runtimeClassName: {{ .Values.runtimeClassName }}
{{- end }}
{{- end }}
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# ServiceAccount used by the metadata-collector workload; RBAC permissions
# are granted to it via a ClusterRoleBinding of the same name.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: {{ include "metadata-collector.fullname" . }}
  labels:
    {{- include "metadata-collector.labels" . | nindent 4 }}
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,6 @@ image:
pullPolicy: IfNotPresent
tag: ""

pauseImage:
repository: registry.k8s.io/pause
tag: "3.10"

podAnnotations: {}

resources:
Expand Down
21 changes: 1 addition & 20 deletions docs/configuration/metadata-collector.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

## Overview

The Metadata Collector module collects GPU metadata using NVIDIA NVML (Management Library) and writes it to a shared file. Other modules read this file to enrich health events with GPU serial numbers, UUIDs, and topology information. This document covers all Helm configuration options for system administrators.
The Metadata Collector module collects GPU metadata using NVIDIA NVML (Management Library) and writes it to a shared file. Other modules read this file to enrich health events with GPU serial numbers, UUIDs, and topology information. This component will also expose the pod-to-GPU mapping as an annotation on each pod requesting GPUs. This document covers all Helm configuration options for system administrators.

## Configuration Reference

Expand Down Expand Up @@ -50,22 +50,3 @@ Runtime class name that provides GPU device access. Required for NVML to query G
- `nvidia` - NVIDIA container runtime (default)
- `nvidia-legacy` - Legacy NVIDIA runtime
- Empty string - Uses default cluster runtime. Used for CRIO environments

## Pause Image

The metadata collector uses an init container pattern. After metadata collection completes, a pause container keeps the pod running.

```yaml
metadata-collector:
pauseImage:
repository: registry.k8s.io/pause
tag: "3.10"
```

### Parameters

#### pauseImage.repository
Container image for the pause container.

#### pauseImage.tag
Image tag for the pause container.
29 changes: 19 additions & 10 deletions docs/metadata-collector.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ The Metadata Collector gathers comprehensive GPU and NVSwitch topology informati

Think of it as a hardware inventory scanner - it catalogs all GPUs, their connections, and NVSwitch fabric topology, making this information available for error analysis and troubleshooting.

In addition to persisting the GPU and NVSwitch topology information from nodes in a local file, the Metadata Collector will also expose the pod-to-GPU mapping as an annotation on each pod requesting GPUs. This allows components running externally to the node to discover this device mapping through the Kubernetes API.

### Why Do You Need This?

Health monitors need detailed hardware information to create accurate health events:
Expand All @@ -15,21 +17,29 @@ Health monitors need detailed hardware information to create accurate health eve
- **Hardware identification**: Track GPU UUIDs, serial numbers, and device names
- **NVSwitch mapping**: Identify which NVSwitches connect which GPUs

Without metadata collection, health monitors can only report generic errors without knowing which specific GPU or NVLink is affected.
Without metadata collection, health monitors can only report generic errors without knowing which specific GPU or NVLink is affected. Additionally, the node drainer module needs the pod-to-GPU mapping to determine which set of pods is impacted by a given health event:

- **Partial drains**: For GPU faults requiring component resets, the node drainer module will reference this mapping to only drain pods leveraging that GPU

## How It Works

The Metadata Collector runs as an init container on each GPU node:
GPU and NVSwitch topology information collection:

1. Initializes NVML (NVIDIA Management Library)
2. Queries GPU information (UUID, PCI address, serial number, device name)
3. Parses NVLink topology from nvidia-smi
4. Builds NVSwitch fabric map
5. Writes comprehensive metadata to JSON file
6. Exits after collection completes

The JSON file persists on the node and is read by health monitors via a shared volume.

GPU-to-pod mapping annotation:

1. To discover all pods running on the given node, this component will call the Kubelet /pods HTTPS endpoint.
2. To discover the GPU devices allocated to each pod, this component will leverage the Kubelet PodResourcesLister gRPC service.
3. If any pod has a change in its GPU device allocation, we will update the tracking annotation on the pod object.
4. The Metadata Collector will run this logic in a loop at a fixed interval to continually update the mapping for new and existing pods.

## Configuration

Configure the Metadata Collector through Helm values:
Expand All @@ -40,10 +50,6 @@ metadata-collector:

# Runtime class for GPU access (omit for CRI-O environments)
runtimeClassName: "nvidia"

pauseImage:
repository: "registry.k8s.io/pause"
tag: "3.10"
```
### Configuration Options
Expand All @@ -62,6 +68,9 @@ The metadata collector gathers:
- Device name/model
- GPU index

### Pod Information
- GPU UUIDs allocated to each pod

### NVLink Topology
- NVLink connections between GPUs
- Remote GPU endpoints for each link
Expand All @@ -86,11 +95,11 @@ Uses NVIDIA Management Library for reliable, direct hardware queries without ext
### Topology Parsing
Parses nvidia-smi output to build complete NVLink topology map showing GPU interconnections.

### Init Container Pattern
Runs as init container, collects metadata once per pod lifecycle, then exits - minimal resource consumption.

### Shared Volume
Writes metadata to shared volume accessible by health monitor sidecars for error correlation.

### JSON Output
Structured JSON format for easy parsing and consumption by health monitors.

### Kubernetes API
The pod-to-GPU mapping is exposed on pod objects as an annotation, which can be consumed by external components.
48 changes: 44 additions & 4 deletions metadata-collector/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -9,21 +9,61 @@ require (
github.com/nvidia/nvsentinel/commons v0.0.0
github.com/nvidia/nvsentinel/data-models v0.0.0
github.com/stretchr/testify v1.11.1
google.golang.org/grpc v1.77.0
k8s.io/api v0.35.0
k8s.io/apimachinery v0.35.0
k8s.io/client-go v0.35.0
k8s.io/kubelet v0.35.0
)

require (
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
github.com/kr/pretty v0.3.1 // indirect
github.com/emicklei/go-restful/v3 v3.13.0 // indirect
github.com/fxamacker/cbor/v2 v2.9.0 // indirect
github.com/go-logr/logr v1.4.3 // indirect
github.com/go-openapi/jsonpointer v0.22.3 // indirect
github.com/go-openapi/jsonreference v0.21.3 // indirect
github.com/go-openapi/swag v0.25.4 // indirect
github.com/go-openapi/swag/cmdutils v0.25.4 // indirect
github.com/go-openapi/swag/conv v0.25.4 // indirect
github.com/go-openapi/swag/fileutils v0.25.4 // indirect
github.com/go-openapi/swag/jsonname v0.25.4 // indirect
github.com/go-openapi/swag/jsonutils v0.25.4 // indirect
github.com/go-openapi/swag/loading v0.25.4 // indirect
github.com/go-openapi/swag/mangling v0.25.4 // indirect
github.com/go-openapi/swag/netutils v0.25.4 // indirect
github.com/go-openapi/swag/stringutils v0.25.4 // indirect
github.com/go-openapi/swag/typeutils v0.25.4 // indirect
github.com/go-openapi/swag/yamlutils v0.25.4 // indirect
github.com/google/gnostic-models v0.7.1 // indirect
github.com/google/uuid v1.6.0 // indirect
github.com/json-iterator/go v1.1.12 // indirect
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee // indirect
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
github.com/rogpeppe/go-internal v1.14.1 // indirect
github.com/stretchr/objx v0.5.2 // indirect
github.com/x448/float16 v0.8.4 // indirect
go.yaml.in/yaml/v2 v2.4.3 // indirect
go.yaml.in/yaml/v3 v3.0.4 // indirect
golang.org/x/net v0.47.0 // indirect
golang.org/x/oauth2 v0.33.0 // indirect
golang.org/x/sys v0.38.0 // indirect
golang.org/x/term v0.37.0 // indirect
golang.org/x/text v0.31.0 // indirect
golang.org/x/time v0.14.0 // indirect
google.golang.org/genproto/googleapis/rpc v0.0.0-20251124214823-79d6a2a48846 // indirect
google.golang.org/grpc v1.77.0 // indirect
google.golang.org/protobuf v1.36.11 // indirect
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c // indirect
gopkg.in/evanphx/json-patch.v4 v4.13.0 // indirect
gopkg.in/inf.v0 v0.9.1 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
k8s.io/klog/v2 v2.130.1 // indirect
k8s.io/kube-openapi v0.0.0-20251125145642-4e65d59e963e // indirect
k8s.io/utils v0.0.0-20251002143259-bc988d571ff4 // indirect
sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 // indirect
sigs.k8s.io/randfill v1.0.0 // indirect
sigs.k8s.io/structured-merge-diff/v6 v6.3.1 // indirect
sigs.k8s.io/yaml v1.6.0 // indirect
)

replace (
Expand Down
Loading
Loading