Skip to content

Commit 546bfe1

Browse files
authored
DCGM-Exporter 4.0.3 (#454)
Update to DCGM 4.1.1. Add support for GPU sharing metrics in k8s (@pintohutch).
1 parent ba26604 commit 546bfe1

16 files changed

+47
-35
lines changed

.devcontainer/Dockerfile

+1-1
Original file line numberDiff line numberDiff line change
@@ -74,4 +74,4 @@ ENV PATH $PATH:/usr/local/go/bin
7474
ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility,compat32
7575
# disable all constraints on the configurations required by NVIDIA container toolkit
7676
ENV NVIDIA_DISABLE_REQUIRE="true"
77-
ENV NVIDIA_VISIBLE_DEVICES=all
77+
ENV NVIDIA_VISIBLE_DEVICES=all

.devcontainer/devcontainer.json

+3-2
Original file line numberDiff line numberDiff line change
@@ -17,5 +17,6 @@
1717
"--security-opt",
1818
"seccomp=unconfined",
1919
"--gpus=all"
20-
]
21-
}
20+
],
21+
"postStartCommand": "docker run --privileged --rm tonistiigi/binfmt --install all"
22+
}

Makefile

+6-5
Original file line numberDiff line numberDiff line change
@@ -30,9 +30,9 @@ DOCKERCMD := docker --debug buildx build
3030
MODULE := github.com/NVIDIA/dcgm-exporter
3131

3232
.PHONY: all binary install check-format local
33-
all: update-version ubuntu22.04 ubi9
33+
all: ubuntu22.04 ubi9
3434

35-
binary: update-version
35+
binary:
3636
cd cmd/dcgm-exporter; $(GO) build -ldflags "-X main.BuildVersion=${DCGM_VERSION}-${VERSION}"
3737

3838
test-main: generate
@@ -46,7 +46,7 @@ check-format:
4646
test $$(gofmt -l pkg | tee /dev/stderr | wc -l) -eq 0
4747
test $$(gofmt -l cmd | tee /dev/stderr | wc -l) -eq 0
4848

49-
push: update-version
49+
push:
5050
$(MAKE) ubuntu22.04 OUTPUT=type=registry
5151
$(MAKE) ubi9 OUTPUT=type=registry
5252

@@ -60,13 +60,13 @@ endif
6060
ubi%: DOCKERFILE = docker/Dockerfile.ubi
6161
ubi%: --docker-build-%
6262
@
63-
ubi9: BASE_IMAGE = nvcr.io/nvidia/cuda:12.6.3-base-ubi9
63+
ubi9: BASE_IMAGE = nvcr.io/nvidia/cuda:12.8.0-base-ubi9
6464
ubi9: IMAGE_TAG = ubi9
6565

6666
ubuntu%: DOCKERFILE = docker/Dockerfile.ubuntu
6767
ubuntu%: --docker-build-%
6868
@
69-
ubuntu22.04: BASE_IMAGE = nvcr.io/nvidia/cuda:12.6.3-base-ubuntu22.04
69+
ubuntu22.04: BASE_IMAGE = nvcr.io/nvidia/cuda:12.8.0-base-ubuntu22.04
7070
ubuntu22.04: IMAGE_TAG = ubuntu22.04
7171

7272

@@ -76,6 +76,7 @@ ubuntu22.04: IMAGE_TAG = ubuntu22.04
7676
$(DOCKERCMD) --pull \
7777
--output $(OUTPUT) \
7878
--progress=plain \
79+
--no-cache \
7980
--platform $(PLATFORMS) \
8081
--build-arg BASEIMAGE="$(BASE_IMAGE)" \
8182
--build-arg "GOLANG_VERSION=$(GOLANG_VERSION)" \

README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ Official documentation for DCGM-Exporter can be found on [docs.nvidia.com](https
1111
To gather metrics on a GPU node, simply start the `dcgm-exporter` container:
1212

1313
```shell
14-
docker run -d --gpus all --cap-add SYS_ADMIN --rm -p 9400:9400 nvcr.io/nvidia/k8s/dcgm-exporter:4.0.0-4.0.0-ubuntu22.04
14+
docker run -d --gpus all --cap-add SYS_ADMIN --rm -p 9400:9400 nvcr.io/nvidia/k8s/dcgm-exporter:4.1.1-4.0.3-ubuntu22.04
1515
curl localhost:9400/metrics
1616
# HELP DCGM_FI_DEV_SM_CLOCK SM clock frequency (in MHz).
1717
# TYPE DCGM_FI_DEV_SM_CLOCK gauge

dcgm-exporter.yaml

+6-6
Original file line numberDiff line numberDiff line change
@@ -18,23 +18,23 @@ metadata:
1818
name: "dcgm-exporter"
1919
labels:
2020
app.kubernetes.io/name: "dcgm-exporter"
21-
app.kubernetes.io/version: "4.0.0"
21+
app.kubernetes.io/version: "4.0.3"
2222
spec:
2323
updateStrategy:
2424
type: RollingUpdate
2525
selector:
2626
matchLabels:
2727
app.kubernetes.io/name: "dcgm-exporter"
28-
app.kubernetes.io/version: "4.0.0"
28+
app.kubernetes.io/version: "4.0.3"
2929
template:
3030
metadata:
3131
labels:
3232
app.kubernetes.io/name: "dcgm-exporter"
33-
app.kubernetes.io/version: "4.0.0"
33+
app.kubernetes.io/version: "4.0.3"
3434
name: "dcgm-exporter"
3535
spec:
3636
containers:
37-
- image: "nvcr.io/nvidia/k8s/dcgm-exporter:4.0.0-4.0.0-ubuntu22.04"
37+
- image: "nvcr.io/nvidia/k8s/dcgm-exporter:4.1.1-4.0.3-ubuntu22.04"
3838
env:
3939
- name: "DCGM_EXPORTER_LISTEN"
4040
value: ":9400"
@@ -66,11 +66,11 @@ metadata:
6666
name: "dcgm-exporter"
6767
labels:
6868
app.kubernetes.io/name: "dcgm-exporter"
69-
app.kubernetes.io/version: "4.0.0"
69+
app.kubernetes.io/version: "4.0.3"
7070
spec:
7171
selector:
7272
app.kubernetes.io/name: "dcgm-exporter"
73-
app.kubernetes.io/version: "4.0.0"
73+
app.kubernetes.io/version: "4.0.3"
7474
ports:
7575
- name: "metrics"
7676
port: 9400

deployment/Chart.yaml

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
apiVersion: v2
22
name: dcgm-exporter
33
description: A Helm chart for DCGM exporter
4-
version: "4.0.0"
4+
version: "4.0.3"
55
kubeVersion: ">= 1.19.0-0"
6-
appVersion: "4.0.0"
6+
appVersion: "4.0.3"
77
sources:
88
- https://github.com/nvidia/dcgm-exporter
99
home: https://github.com/nvidia/dcgm-exporter/

deployment/values.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ image:
1717
pullPolicy: IfNotPresent
1818
# Image tag defaults to AppVersion, but you can use the tag key
1919
# for the image tag, e.g:
20-
tag: 4.0.0-4.0.0-ubuntu22.04
20+
tag: 4.1.1-4.0.3-ubuntu22.04
2121

2222
# Change the following reference to "/etc/dcgm-exporter/default-counters.csv"
2323
# to stop profiling metrics from DCGM

docker/Dockerfile.ubi

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
ARG BASEIMAGE=nvcr.io/nvidia/cuda:12.6.3-base-ubi9
1+
ARG BASEIMAGE=nvcr.io/nvidia/cuda:12.8.0-base-ubi9
22

33
FROM --platform=$BUILDPLATFORM ubuntu:18.04 AS builder
44

docker/Dockerfile.ubuntu

+1-2
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
ARG BASEIMAGE=nvcr.io/nvidia/cuda:12.6.3-base-ubuntu22.04
1+
ARG BASEIMAGE=nvcr.io/nvidia/cuda:12.8.0-base-ubuntu22.04
22

33
FROM --platform=$BUILDPLATFORM ubuntu:18.04 AS builder
44

@@ -72,7 +72,6 @@ COPY etc /etc/dcgm-exporter
7272
ENV DEBIAN_FRONTEND=noninteractive
7373
RUN echo "$TARGETARCH" && apt-get -qq update && apt-get -qq install -y --no-install-recommends \
7474
datacenter-gpu-manager-4-core libcap2-bin \
75-
&& apt-get -qq purge --autoremove -y openssl \
7675
&& apt-get -qq -y clean \
7776
&& apt-get -qq -y autoclean \
7877
&& apt-get -qq autoremove -y \

docker/dcgm-exporter-entrypoint.sh

+1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#!/usr/bin/env bash
22
set -euo pipefail
33

4+
ldconfig #Must refresh the cache to find libdcgm.so
45
# We want to setcap only when the container is started with the right caps
56
DCGM_EXPORTER=$(readlink -f $(which dcgm-exporter))
67
if [ -z "$NO_SETCAP" ]; then

go.mod

+5-5
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ go 1.22.0
55
toolchain go1.22.9
66

77
require (
8-
github.com/NVIDIA/go-dcgm v0.0.0-20250106155650-850266c9c8a5
8+
github.com/NVIDIA/go-dcgm v0.0.0-20250206205700-2b08cf88dcdc
99
github.com/NVIDIA/go-nvml v0.12.4-0
1010
github.com/avast/retry-go/v4 v4.6.0
1111
github.com/bits-and-blooms/bitset v1.17.0
@@ -145,11 +145,11 @@ require (
145145
go.opentelemetry.io/otel/metric v1.28.0 // indirect
146146
go.opentelemetry.io/otel/trace v1.28.0 // indirect
147147
go.starlark.net v0.0.0-20231121155337-90ade8b19d09 // indirect
148-
golang.org/x/crypto v0.31.0 // indirect
149-
golang.org/x/net v0.32.0 // indirect
148+
golang.org/x/crypto v0.32.0 // indirect
149+
golang.org/x/net v0.34.0 // indirect
150150
golang.org/x/oauth2 v0.24.0 // indirect
151-
golang.org/x/sys v0.28.0 // indirect
152-
golang.org/x/term v0.27.0 // indirect
151+
golang.org/x/sys v0.29.0 // indirect
152+
golang.org/x/term v0.28.0 // indirect
153153
golang.org/x/text v0.21.0 // indirect
154154
golang.org/x/time v0.5.0 // indirect
155155
golang.org/x/tools v0.26.0 // indirect

go.sum

+10
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@ github.com/Microsoft/hcsshim v0.11.4 h1:68vKo2VN8DE9AdN4tnkWnmdhqdbpUFM8OF3Airm7
2828
github.com/Microsoft/hcsshim v0.11.4/go.mod h1:smjE4dvqPX9Zldna+t5FG3rnoHhaB7QYxPRqGcpAD9w=
2929
github.com/NVIDIA/go-dcgm v0.0.0-20250106155650-850266c9c8a5 h1:+HrFl/XGrOqfX8tgvJTCHfuDzbZbpdEQmbOdcDR53Ew=
3030
github.com/NVIDIA/go-dcgm v0.0.0-20250106155650-850266c9c8a5/go.mod h1:kaRlwPjisNMY7xH8QWJ+6q76YJ/1eu6pWV45B5Ew6C4=
31+
github.com/NVIDIA/go-dcgm v0.0.0-20250206205700-2b08cf88dcdc h1:sk7fIw8PGbElCNnc7q64vMgadgMFKSEtDtPweEyXvY4=
32+
github.com/NVIDIA/go-dcgm v0.0.0-20250206205700-2b08cf88dcdc/go.mod h1:kaRlwPjisNMY7xH8QWJ+6q76YJ/1eu6pWV45B5Ew6C4=
3133
github.com/NVIDIA/go-nvml v0.12.4-0 h1:4tkbB3pT1O77JGr0gQ6uD8FrsUPqP1A/EOEm2wI1TUg=
3234
github.com/NVIDIA/go-nvml v0.12.4-0/go.mod h1:8Llmj+1Rr+9VGGwZuRer5N/aCjxGuR5nPb/9ebBiIEQ=
3335
github.com/Shopify/logrus-bugsnag v0.0.0-20171204204709-577dee27f20d h1:UrqY+r/OJnIp5u0s1SbQ8dVfLCZJsnvazdBP5hS4iRs=
@@ -396,6 +398,8 @@ golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8U
396398
golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
397399
golang.org/x/crypto v0.31.0 h1:ihbySMvVjLAeSH1IbfcRTkD/iNscyz8rGzjF/E5hV6U=
398400
golang.org/x/crypto v0.31.0/go.mod h1:kDsLvtWBEx7MV9tJOj9bnXsPbxwJQ6csT/x4KIN4Ssk=
401+
golang.org/x/crypto v0.32.0 h1:euUpcYgM8WcP71gNpTqQCn6rC2t6ULUPiOzfWaXVVfc=
402+
golang.org/x/crypto v0.32.0/go.mod h1:ZnnJkOaASj8g0AjIduWNlq2NRxL0PlBrbKVyZ6V/Ugc=
399403
golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
400404
golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
401405
golang.org/x/mod v0.21.0 h1:vvrHzRwRfVKSiLrG+d4FMl/Qi4ukBCE6kZlTUkDYRT0=
@@ -408,6 +412,8 @@ golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLL
408412
golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
409413
golang.org/x/net v0.32.0 h1:ZqPmj8Kzc+Y6e0+skZsuACbx+wzMgo5MQsJh9Qd6aYI=
410414
golang.org/x/net v0.32.0/go.mod h1:CwU0IoeOlnQQWJ6ioyFrfRuomB8GKF6KbYXZVyeXNfs=
415+
golang.org/x/net v0.34.0 h1:Mb7Mrk043xzHgnRM88suvJFwzVrRfHEHJEl5/71CKw0=
416+
golang.org/x/net v0.34.0/go.mod h1:di0qlW3YNM5oh6GqDGQr92MyTozJPmybPK4Ev/Gm31k=
411417
golang.org/x/oauth2 v0.24.0 h1:KTBBxWqUa0ykRPLtV69rRto9TLXcqYkeswu48x/gvNE=
412418
golang.org/x/oauth2 v0.24.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI=
413419
golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
@@ -429,8 +435,12 @@ golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBc
429435
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
430436
golang.org/x/sys v0.28.0 h1:Fksou7UEQUWlKvIdsqzJmUmCX3cZuD2+P3XyyzwMhlA=
431437
golang.org/x/sys v0.28.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
438+
golang.org/x/sys v0.29.0 h1:TPYlXGxvx1MGTn2GiZDhnjPA9wZzZeGKHHmKhHYvgaU=
439+
golang.org/x/sys v0.29.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
432440
golang.org/x/term v0.27.0 h1:WP60Sv1nlK1T6SupCHbXzSaN0b9wUmsPoRS9b61A23Q=
433441
golang.org/x/term v0.27.0/go.mod h1:iMsnZpn0cago0GOrHO2+Y7u7JPn5AylBrcoWkElMTSM=
442+
golang.org/x/term v0.28.0 h1:/Ts8HFuMR2E6IP/jlo7QVLZHggjKQbhu/7H0LJFr3Gg=
443+
golang.org/x/term v0.28.0/go.mod h1:Sw/lC2IAUZ92udQNf3WodGtn4k/XoLyZoh8v/8uiwek=
434444
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
435445
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
436446
golang.org/x/text v0.21.0 h1:zyQAAkrwaneQ066sspRyJaG9VNi/YJ1NfzcGB3hZ/qo=

hack/VERSION

+4-4
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
OLD_DCGM_VERSION=3.3.9
2-
OLD_EXPORTER_VERSION=3.6.1
3-
NEW_DCGM_VERSION=4.0.0
4-
NEW_EXPORTER_VERSION=4.0.0
1+
OLD_DCGM_VERSION=4.1.0
2+
OLD_EXPORTER_VERSION=4.0.2
3+
NEW_DCGM_VERSION=4.1.1
4+
NEW_EXPORTER_VERSION=4.0.3

internal/pkg/transformation/kubernetes.go

+2-2
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,7 @@ func (p *PodMapper) Process(metrics collector.MetricsByCounter, deviceInfo devic
9696
return err
9797
}
9898

99-
podInfos, _ := deviceToPods[deviceID]
99+
podInfos := deviceToPods[deviceID]
100100
// For all containers using the GPU, extract and annotate a metric
101101
// with the container info and the shared GPU label, if it exists.
102102
// Notably, this will increase the number of unique metrics (i.e. labelsets)
@@ -203,7 +203,7 @@ func getSharedGPU(deviceID string) (string, bool) {
203203
}
204204

205205
// toDeviceToSharingPods uses the same general logic as toDeviceToPod but
206-
// allows for multiple contianers to be associated with a metric when sharing
206+
// allows for multiple containers to be associated with a metric when sharing
207207
// strategies are used in Kubernetes.
208208
// TODO(pintohutch): the logic is manually duplicated from toDeviceToPod for
209209
// better isolation and easier review. Ultimately, this logic should be

service-monitor.yaml

+2-2
Original file line numberDiff line numberDiff line change
@@ -18,12 +18,12 @@ metadata:
1818
name: "dcgm-exporter"
1919
labels:
2020
app.kubernetes.io/name: "dcgm-exporter"
21-
app.kubernetes.io/version: "4.0.0"
21+
app.kubernetes.io/version: "4.0.3"
2222
spec:
2323
selector:
2424
matchLabels:
2525
app.kubernetes.io/name: "dcgm-exporter"
26-
app.kubernetes.io/version: "4.0.0"
26+
app.kubernetes.io/version: "4.0.3"
2727
endpoints:
2828
- port: "metrics"
2929
path: "/metrics"

tests/e2e/Makefile

+1-1
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ GO_CMD ?= go
1616
NAMESPACE ?= "dcgm-exporter"
1717
CHART ?= "./../../deployment/"
1818
IMAGE_REPOSITORY ?= "nvcr.io/nvidia/k8s/dcgm-exporter"
19-
IMAGE_TAG ?= "4.0.0-4.0.0-ubuntu22.04"
19+
IMAGE_TAG ?= "4.1.1-4.0.3-ubuntu22.04"
2020
KUBECONFIG ?= "~/.kube/config"
2121
RUNTIME_CLASS ?= ""
2222
NO_CLEANUP ?= "false"

0 commit comments

Comments
 (0)