Skip to content

Commit 546bfe1

Browse files
authored
DCGM-Exporter 4.0.3 (#454)
Update to DCGM 4.1.1; add support for GPU sharing metrics in k8s (@pintohutch)
1 parent ba26604 commit 546bfe1

File tree

16 files changed

+47
-35
lines changed

16 files changed

+47
-35
lines changed

.devcontainer/Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,4 +74,4 @@ ENV PATH $PATH:/usr/local/go/bin
7474
ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility,compat32
7575
# disable all constraints on the configurations required by NVIDIA container toolkit
7676
ENV NVIDIA_DISABLE_REQUIRE="true"
77-
ENV NVIDIA_VISIBLE_DEVICES=all
77+
ENV NVIDIA_VISIBLE_DEVICES=all

.devcontainer/devcontainer.json

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,5 +17,6 @@
1717
"--security-opt",
1818
"seccomp=unconfined",
1919
"--gpus=all"
20-
]
21-
}
20+
],
21+
"postStartCommand": "docker run --privileged --rm tonistiigi/binfmt --install all"
22+
}

Makefile

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -30,9 +30,9 @@ DOCKERCMD := docker --debug buildx build
3030
MODULE := github.com/NVIDIA/dcgm-exporter
3131

3232
.PHONY: all binary install check-format local
33-
all: update-version ubuntu22.04 ubi9
33+
all: ubuntu22.04 ubi9
3434

35-
binary: update-version
35+
binary:
3636
cd cmd/dcgm-exporter; $(GO) build -ldflags "-X main.BuildVersion=${DCGM_VERSION}-${VERSION}"
3737

3838
test-main: generate
@@ -46,7 +46,7 @@ check-format:
4646
test $$(gofmt -l pkg | tee /dev/stderr | wc -l) -eq 0
4747
test $$(gofmt -l cmd | tee /dev/stderr | wc -l) -eq 0
4848

49-
push: update-version
49+
push:
5050
$(MAKE) ubuntu22.04 OUTPUT=type=registry
5151
$(MAKE) ubi9 OUTPUT=type=registry
5252

@@ -60,13 +60,13 @@ endif
6060
ubi%: DOCKERFILE = docker/Dockerfile.ubi
6161
ubi%: --docker-build-%
6262
@
63-
ubi9: BASE_IMAGE = nvcr.io/nvidia/cuda:12.6.3-base-ubi9
63+
ubi9: BASE_IMAGE = nvcr.io/nvidia/cuda:12.8.0-base-ubi9
6464
ubi9: IMAGE_TAG = ubi9
6565

6666
ubuntu%: DOCKERFILE = docker/Dockerfile.ubuntu
6767
ubuntu%: --docker-build-%
6868
@
69-
ubuntu22.04: BASE_IMAGE = nvcr.io/nvidia/cuda:12.6.3-base-ubuntu22.04
69+
ubuntu22.04: BASE_IMAGE = nvcr.io/nvidia/cuda:12.8.0-base-ubuntu22.04
7070
ubuntu22.04: IMAGE_TAG = ubuntu22.04
7171

7272

@@ -76,6 +76,7 @@ ubuntu22.04: IMAGE_TAG = ubuntu22.04
7676
$(DOCKERCMD) --pull \
7777
--output $(OUTPUT) \
7878
--progress=plain \
79+
--no-cache \
7980
--platform $(PLATFORMS) \
8081
--build-arg BASEIMAGE="$(BASE_IMAGE)" \
8182
--build-arg "GOLANG_VERSION=$(GOLANG_VERSION)" \

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ Official documentation for DCGM-Exporter can be found on [docs.nvidia.com](https
1111
To gather metrics on a GPU node, simply start the `dcgm-exporter` container:
1212

1313
```shell
14-
docker run -d --gpus all --cap-add SYS_ADMIN --rm -p 9400:9400 nvcr.io/nvidia/k8s/dcgm-exporter:4.0.0-4.0.0-ubuntu22.04
14+
docker run -d --gpus all --cap-add SYS_ADMIN --rm -p 9400:9400 nvcr.io/nvidia/k8s/dcgm-exporter:4.1.1-4.0.3-ubuntu22.04
1515
curl localhost:9400/metrics
1616
# HELP DCGM_FI_DEV_SM_CLOCK SM clock frequency (in MHz).
1717
# TYPE DCGM_FI_DEV_SM_CLOCK gauge

dcgm-exporter.yaml

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -18,23 +18,23 @@ metadata:
1818
name: "dcgm-exporter"
1919
labels:
2020
app.kubernetes.io/name: "dcgm-exporter"
21-
app.kubernetes.io/version: "4.0.0"
21+
app.kubernetes.io/version: "4.0.3"
2222
spec:
2323
updateStrategy:
2424
type: RollingUpdate
2525
selector:
2626
matchLabels:
2727
app.kubernetes.io/name: "dcgm-exporter"
28-
app.kubernetes.io/version: "4.0.0"
28+
app.kubernetes.io/version: "4.0.3"
2929
template:
3030
metadata:
3131
labels:
3232
app.kubernetes.io/name: "dcgm-exporter"
33-
app.kubernetes.io/version: "4.0.0"
33+
app.kubernetes.io/version: "4.0.3"
3434
name: "dcgm-exporter"
3535
spec:
3636
containers:
37-
- image: "nvcr.io/nvidia/k8s/dcgm-exporter:4.0.0-4.0.0-ubuntu22.04"
37+
- image: "nvcr.io/nvidia/k8s/dcgm-exporter:4.1.1-4.0.3-ubuntu22.04"
3838
env:
3939
- name: "DCGM_EXPORTER_LISTEN"
4040
value: ":9400"
@@ -66,11 +66,11 @@ metadata:
6666
name: "dcgm-exporter"
6767
labels:
6868
app.kubernetes.io/name: "dcgm-exporter"
69-
app.kubernetes.io/version: "4.0.0"
69+
app.kubernetes.io/version: "4.0.3"
7070
spec:
7171
selector:
7272
app.kubernetes.io/name: "dcgm-exporter"
73-
app.kubernetes.io/version: "4.0.0"
73+
app.kubernetes.io/version: "4.0.3"
7474
ports:
7575
- name: "metrics"
7676
port: 9400

deployment/Chart.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
apiVersion: v2
22
name: dcgm-exporter
33
description: A Helm chart for DCGM exporter
4-
version: "4.0.0"
4+
version: "4.0.3"
55
kubeVersion: ">= 1.19.0-0"
6-
appVersion: "4.0.0"
6+
appVersion: "4.0.3"
77
sources:
88
- https://github.com/nvidia/dcgm-exporter
99
home: https://github.com/nvidia/dcgm-exporter/

deployment/values.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ image:
1717
pullPolicy: IfNotPresent
1818
# Image tag defaults to AppVersion, but you can use the tag key
1919
# for the image tag, e.g:
20-
tag: 4.0.0-4.0.0-ubuntu22.04
20+
tag: 4.1.1-4.0.3-ubuntu22.04
2121

2222
# Change the following reference to "/etc/dcgm-exporter/default-counters.csv"
2323
# to stop profiling metrics from DCGM

docker/Dockerfile.ubi

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
ARG BASEIMAGE=nvcr.io/nvidia/cuda:12.6.3-base-ubi9
1+
ARG BASEIMAGE=nvcr.io/nvidia/cuda:12.8.0-base-ubi9
22

33
FROM --platform=$BUILDPLATFORM ubuntu:18.04 AS builder
44

docker/Dockerfile.ubuntu

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
ARG BASEIMAGE=nvcr.io/nvidia/cuda:12.6.3-base-ubuntu22.04
1+
ARG BASEIMAGE=nvcr.io/nvidia/cuda:12.8.0-base-ubuntu22.04
22

33
FROM --platform=$BUILDPLATFORM ubuntu:18.04 AS builder
44

@@ -72,7 +72,6 @@ COPY etc /etc/dcgm-exporter
7272
ENV DEBIAN_FRONTEND=noninteractive
7373
RUN echo "$TARGETARCH" && apt-get -qq update && apt-get -qq install -y --no-install-recommends \
7474
datacenter-gpu-manager-4-core libcap2-bin \
75-
&& apt-get -qq purge --autoremove -y openssl \
7675
&& apt-get -qq -y clean \
7776
&& apt-get -qq -y autoclean \
7877
&& apt-get -qq autoremove -y \

docker/dcgm-exporter-entrypoint.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#!/usr/bin/env bash
22
set -euo pipefail
33

4+
ldconfig #Must refresh the cache to find libdcgm.so
45
# We want to setcap only when the container is started with the right caps
56
DCGM_EXPORTER=$(readlink -f $(which dcgm-exporter))
67
if [ -z "$NO_SETCAP" ]; then

0 commit comments

Comments
 (0)