Skip to content

Commit 3705a73

Browse files
committed
feature: update metadata-collector to track the GPUs allocated to each pod
Signed-off-by: Nathan Herz <[email protected]>
1 parent fd4466e commit 3705a73

File tree

15 files changed

+2066
-26
lines changed

15 files changed

+2066
-26
lines changed
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
package model
16+
17+
const (
18+
// PodDeviceAnnotationName is the annotation key "dgxc.nvidia.com/devices".
// NOTE(review): presumably the key under which a pod's allocated devices are
// recorded (see DeviceAnnotation) — confirm against the code that writes it.
PodDeviceAnnotationName = "dgxc.nvidia.com/devices"
19+
)
20+
21+
var (
22+
// EntityTypeToResourceNames maps an entity type name to the list of resource
// names associated with it. The only entry maps "GPU_UUID" to the
// "nvidia.com/gpu" and "nvidia.com/pgpu" resource names.
// NOTE(review): this is a package-level mutable map; callers presumably treat
// it as read-only — confirm.
EntityTypeToResourceNames = map[string][]string{
23+
"GPU_UUID": {
24+
"nvidia.com/gpu",
25+
"nvidia.com/pgpu",
26+
},
27+
}
28+
)
29+
30+
// DeviceAnnotation is a JSON-serializable record with a single "devices" field.
// NOTE(review): presumably this is the value stored under
// PodDeviceAnnotationName — confirm against the code that marshals it.
type DeviceAnnotation struct {
31+
// Devices maps a string key to a list of strings and is encoded as "devices".
// NOTE(review): keys look like entity types (e.g. "GPU_UUID") and values like
// device identifiers — confirm.
Devices map[string][]string `json:"devices"`
32+
}
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
# ClusterRole named after the chart's fullname. It grants two permissions:
# "patch" on pods and "get" on nodes/proxy, both in the core ("") API group.
apiVersion: rbac.authorization.k8s.io/v1
16+
kind: ClusterRole
17+
metadata:
18+
name: {{ include "metadata-collector.fullname" . }}
19+
labels:
20+
{{- include "metadata-collector.labels" . | nindent 4 }}
21+
rules:
22+
# Rule 1: allow patching pods.
# NOTE(review): presumably used to write the device annotation onto pods — confirm.
- apiGroups:
23+
- ""
24+
resources:
25+
- pods
26+
verbs:
27+
- patch
28+
# Rule 2: allow GET on the nodes/proxy subresource.
# NOTE(review): the reason this access is needed is not visible here — confirm.
- apiGroups:
29+
- ""
30+
resources:
31+
- nodes/proxy
32+
verbs:
33+
- get
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
# ClusterRoleBinding named after the chart's fullname. It binds the ClusterRole
# of the same name to the ServiceAccount of the same name in the release namespace.
apiVersion: rbac.authorization.k8s.io/v1
16+
kind: ClusterRoleBinding
17+
metadata:
18+
name: {{ include "metadata-collector.fullname" . }}
19+
labels:
20+
{{- include "metadata-collector.labels" . | nindent 4 }}
21+
# The role being granted.
roleRef:
22+
apiGroup: rbac.authorization.k8s.io
23+
kind: ClusterRole
24+
name: {{ include "metadata-collector.fullname" . }}
25+
# The identity receiving the role.
subjects:
26+
- kind: ServiceAccount
27+
name: {{ include "metadata-collector.fullname" . }}
28+
namespace: {{ .Release.Namespace }}

distros/kubernetes/nvsentinel/charts/metadata-collector/templates/daemonset.yaml

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -35,13 +35,14 @@ spec:
3535
labels:
3636
{{- include "metadata-collector.selectorLabels" . | nindent 8 }}
3737
spec:
38+
serviceAccountName: {{ include "metadata-collector.fullname" . }}
3839
{{- with .Values.global.imagePullSecrets }}
3940
imagePullSecrets:
4041
{{- toYaml . | nindent 8 }}
4142
{{- end }}
4243
hostNetwork: true
4344
hostPID: true
44-
initContainers:
45+
containers:
4546
- name: metadata-collector
4647
securityContext:
4748
runAsUser: 0
@@ -65,10 +66,9 @@ spec:
6566
- name: sys
6667
mountPath: /sys
6768
readOnly: true
68-
containers:
69-
- name: pause
70-
image: "{{ .Values.pauseImage.repository }}:{{ .Values.pauseImage.tag }}"
71-
imagePullPolicy: {{ .Values.image.pullPolicy }}
69+
- mountPath: /var/lib/kubelet/pod-resources
70+
name: pod-gpu-resources
71+
readOnly: true
7272
volumes:
7373
- name: output
7474
hostPath:
@@ -78,6 +78,10 @@ spec:
7878
hostPath:
7979
path: /sys
8080
type: Directory
81+
- name: pod-gpu-resources
82+
hostPath:
83+
path: /var/lib/kubelet/pod-resources
84+
type: Directory
8185
nodeSelector:
8286
nvidia.com/gpu.present: "true"
8387
nvsentinel.dgxc.nvidia.com/driver.installed: "true"
@@ -94,4 +98,4 @@ spec:
9498
{{- end }}
9599
{{- if .Values.runtimeClassName }}
96100
runtimeClassName: {{ .Values.runtimeClassName }}
97-
{{- end }}
101+
{{- end }}
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
# ServiceAccount named after the chart's fullname and carrying the chart's
# standard labels. No namespace is set in this manifest.
apiVersion: v1
16+
kind: ServiceAccount
17+
metadata:
18+
name: {{ include "metadata-collector.fullname" . }}
19+
labels:
20+
{{- include "metadata-collector.labels" . | nindent 4 }}

distros/kubernetes/nvsentinel/charts/metadata-collector/values.yaml

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,6 @@ image:
1717
pullPolicy: IfNotPresent
1818
tag: ""
1919

20-
pauseImage:
21-
repository: registry.k8s.io/pause
22-
tag: "3.10"
23-
2420
podAnnotations: {}
2521

2622
resources:

metadata-collector/go.mod

Lines changed: 44 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,21 +9,61 @@ require (
99
github.com/nvidia/nvsentinel/commons v0.0.0
1010
github.com/nvidia/nvsentinel/data-models v0.0.0
1111
github.com/stretchr/testify v1.11.1
12+
google.golang.org/grpc v1.77.0
13+
k8s.io/api v0.35.0
14+
k8s.io/apimachinery v0.35.0
15+
k8s.io/client-go v0.35.0
16+
k8s.io/kubelet v0.35.0
1217
)
1318

1419
require (
1520
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
16-
github.com/kr/pretty v0.3.1 // indirect
21+
github.com/emicklei/go-restful/v3 v3.13.0 // indirect
22+
github.com/fxamacker/cbor/v2 v2.9.0 // indirect
23+
github.com/go-logr/logr v1.4.3 // indirect
24+
github.com/go-openapi/jsonpointer v0.22.3 // indirect
25+
github.com/go-openapi/jsonreference v0.21.3 // indirect
26+
github.com/go-openapi/swag v0.25.4 // indirect
27+
github.com/go-openapi/swag/cmdutils v0.25.4 // indirect
28+
github.com/go-openapi/swag/conv v0.25.4 // indirect
29+
github.com/go-openapi/swag/fileutils v0.25.4 // indirect
30+
github.com/go-openapi/swag/jsonname v0.25.4 // indirect
31+
github.com/go-openapi/swag/jsonutils v0.25.4 // indirect
32+
github.com/go-openapi/swag/loading v0.25.4 // indirect
33+
github.com/go-openapi/swag/mangling v0.25.4 // indirect
34+
github.com/go-openapi/swag/netutils v0.25.4 // indirect
35+
github.com/go-openapi/swag/stringutils v0.25.4 // indirect
36+
github.com/go-openapi/swag/typeutils v0.25.4 // indirect
37+
github.com/go-openapi/swag/yamlutils v0.25.4 // indirect
38+
github.com/google/gnostic-models v0.7.1 // indirect
39+
github.com/google/uuid v1.6.0 // indirect
40+
github.com/json-iterator/go v1.1.12 // indirect
41+
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
42+
github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee // indirect
43+
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
1744
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
18-
github.com/rogpeppe/go-internal v1.14.1 // indirect
45+
github.com/stretchr/objx v0.5.2 // indirect
46+
github.com/x448/float16 v0.8.4 // indirect
47+
go.yaml.in/yaml/v2 v2.4.3 // indirect
48+
go.yaml.in/yaml/v3 v3.0.4 // indirect
1949
golang.org/x/net v0.47.0 // indirect
50+
golang.org/x/oauth2 v0.33.0 // indirect
2051
golang.org/x/sys v0.38.0 // indirect
52+
golang.org/x/term v0.37.0 // indirect
2153
golang.org/x/text v0.31.0 // indirect
54+
golang.org/x/time v0.14.0 // indirect
2255
google.golang.org/genproto/googleapis/rpc v0.0.0-20251124214823-79d6a2a48846 // indirect
23-
google.golang.org/grpc v1.77.0 // indirect
2456
google.golang.org/protobuf v1.36.11 // indirect
25-
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c // indirect
57+
gopkg.in/evanphx/json-patch.v4 v4.13.0 // indirect
58+
gopkg.in/inf.v0 v0.9.1 // indirect
2659
gopkg.in/yaml.v3 v3.0.1 // indirect
60+
k8s.io/klog/v2 v2.130.1 // indirect
61+
k8s.io/kube-openapi v0.0.0-20251125145642-4e65d59e963e // indirect
62+
k8s.io/utils v0.0.0-20251002143259-bc988d571ff4 // indirect
63+
sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 // indirect
64+
sigs.k8s.io/randfill v1.0.0 // indirect
65+
sigs.k8s.io/structured-merge-diff/v6 v6.3.1 // indirect
66+
sigs.k8s.io/yaml v1.6.0 // indirect
2767
)
2868

2969
replace (

0 commit comments

Comments
 (0)