Skip to content

Commit 5f9250c

Browse files
authored
DCGM-Exporter 4.0.0 (#437)
- Update to DCGM 4.0.0 - Major refactor to enable clean mock testing - Refactor metric collection to align with prometheus best practices - Many more bug fixes and improvements
1 parent 900d465 commit 5f9250c

File tree

163 files changed

+20574
-6517
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

163 files changed

+20574
-6517
lines changed

.devcontainer/Dockerfile

+39-31
Original file line numberDiff line numberDiff line change
@@ -1,31 +1,39 @@
1-
FROM nvcr.io/nvidia/cuda:12.3.1-base-ubuntu22.04
2-
ARG GOLANG_VERSION=1.21.5
1+
FROM nvcr.io/nvidia/cuda:12.6.3-base-ubuntu22.04
2+
ARG GOLANG_VERSION=1.22.5
33
ARG USERNAME=developer
44
ARG USER_UID=1000
55
ARG USER_GID=1000
6-
ARG DCGM_VERSION=3.3.3
76
# Create a user 'developer' with UID=1000, add to 'developer' group, and add to 'sudo' group
87
# The account's UID comes from USER_UID and its primary group from USER_GID,
# so the two build arguments can be set independently.
RUN groupadd -g $USER_GID $USERNAME && \
    useradd -m -u $USER_UID -g $USERNAME -s /bin/bash $USERNAME && \
    usermod -aG sudo $USERNAME
1110
# Allow 'developer' to use sudo without a password
1211
RUN echo "$USERNAME ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers
1312

1413
RUN --mount=type=cache,target=/var/cache/apt \
1514
set -eux; \
1615
apt-get update; \
1716
apt-get install -y --no-install-recommends \
18-
git \
19-
ca-certificates \
20-
g++ \
21-
gcc \
22-
libc6-dev \
23-
make \
24-
pkg-config \
25-
wget \
26-
datacenter-gpu-manager=1:${DCGM_VERSION} \
27-
libcap2-bin \
28-
&& apt-get autoremove -y \
17+
git \
18+
ca-certificates \
19+
g++ \
20+
gcc \
21+
libc6-dev \
22+
make \
23+
pkg-config \
24+
wget \
25+
datacenter-gpu-manager-4-core \
26+
libcap2-bin \
27+
&& install -m 0755 -d /etc/apt/keyrings \
28+
&& wget -O /etc/apt/keyrings/docker.asc https://download.docker.com/linux/ubuntu/gpg \
29+
&& chmod a+r /etc/apt/keyrings/docker.asc \
30+
&& echo \
31+
"deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \
32+
$(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \
33+
tee /etc/apt/sources.list.d/docker.list > /dev/null \
34+
&& apt-get update \
35+
&& apt-get install -y --no-install-recommends docker-ce docker-ce-cli containerd.io docker-buildx-plugin \
36+
&& apt-get autoremove -y \
2937
&& rm -rfd /usr/local/dcgm/bindings /usr/local/dcgm/sdk_samples /usr/share/nvidia-validation-suite \
3038
# DCGM exporter doesn't use libdcgm_cublas_proxy*.so.
3139
&& rm -rf /usr/lib/x86_64-linux-gnu/libdcgm_cublas_proxy*.so \
@@ -36,25 +44,25 @@ RUN set -eux; \
3644
url=; \
3745
echo "$arch"; \
3846
case "$arch" in \
39-
'amd64') \
40-
url="https://dl.google.com/go/go${GOLANG_VERSION}.linux-amd64.tar.gz"; \
41-
;; \
42-
'arm64') \
43-
url="https://dl.google.com/go/go${GOLANG_VERSION}.linux-arm64.tar.gz"; \
44-
;; \
45-
*) echo >&2 "error: unsupported architecture '$arch' (likely packaging update needed)"; exit 1 ;; \
47+
'amd64') \
48+
url="https://dl.google.com/go/go${GOLANG_VERSION}.linux-amd64.tar.gz"; \
49+
;; \
50+
'arm64') \
51+
url="https://dl.google.com/go/go${GOLANG_VERSION}.linux-arm64.tar.gz"; \
52+
;; \
53+
*) echo >&2 "error: unsupported architecture '$arch' (likely packaging update needed)"; exit 1 ;; \
4654
esac; \
4755
build=; \
4856
if [ -z "$url" ]; then \
49-
# https://github.com/golang/go/issues/38536#issuecomment-616897960
50-
build=1; \
51-
url="https://dl.google.com/go/go${GOLANG_VERSION}.src.tar.gz"; \
52-
echo >&2; \
53-
echo >&2 "warning: current architecture ($arch) does not have a compatible Go binary release; will be building from source"; \
54-
echo >&2; \
57+
# https://github.com/golang/go/issues/38536#issuecomment-616897960
58+
build=1; \
59+
url="https://dl.google.com/go/go${GOLANG_VERSION}.src.tar.gz"; \
60+
echo >&2; \
61+
echo >&2 "warning: current architecture ($arch) does not have a compatible Go binary release; will be building from source"; \
62+
echo >&2; \
5563
fi; \
56-
wget -O go.tgz "$url" --progress=dot:giga; \
57-
tar -C /usr/local -xzf go.tgz; \
64+
wget -O go.tgz "$url" --progress=dot:giga; \
65+
tar -C /usr/local -xzf go.tgz; \
5866
rm go.tgz
5967
ENV GOTOOLCHAIN=local
6068
ENV GOPATH /go

.github/workflows/go.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ jobs:
1616
- name: Set up Go
1717
uses: actions/setup-go@v2
1818
with:
19-
go-version: 1.21
19+
go-version: 1.22
2020

2121
- name: Build
2222
run: make binary

.gitignore

+4-2
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,9 @@ tests.cov
1111
test_results.json
1212
.scannerwork
1313
dist/
14-
.run/
14+
.run
15+
dist/
16+
1517
###############################################################################
1618
# JetBrains
1719
# https://github.com/github/gitignore/blob/master/Global/JetBrains.gitignore
@@ -236,4 +238,4 @@ $RECYCLE.BIN/
236238
*.msp
237239

238240
# Windows shortcuts
239-
*.lnk
241+
*.lnk

.vscode/launch.json

+14-8
Original file line numberDiff line numberDiff line change
@@ -10,12 +10,18 @@
1010
"request": "launch",
1111
"mode": "test",
1212
"program": "${workspaceFolder}/tests/e2e",
13-
"args": ["-test.v",
14-
"--ginkgo.v",
15-
"-kubeconfig","~/.kube/config",
16-
"-chart","./../../deployment/",
17-
"-image-repository","nvidia/dcgm-exporter",
18-
"-arguments","{-f=/etc/dcgm-exporter/default-counters.csv,--enable-dcgm-log=true,--dcgm-log-level=ERROR}"],
13+
"args": [
14+
"-test.v",
15+
"--ginkgo.v",
16+
"-kubeconfig",
17+
"~/.kube/config",
18+
"-chart",
19+
"./../../deployment/",
20+
"-image-repository",
21+
"nvidia/dcgm-exporter",
22+
"-arguments",
23+
"{-f=/etc/dcgm-exporter/default-counters.csv}"
24+
],
1925
"env": {},
2026
"buildFlags": "-tags=e2e"
2127
},
@@ -30,8 +36,8 @@
3036
"-f",
3137
"./etc/default-counters.csv",
3238
"--debug",
33-
"--enable-dcgm-log",
34-
"--dcgm-log-level=INFO"
39+
"-r",
40+
"localhost:5555"
3541
]
3642
}
3743
]

Jenkinsfile

-64
This file was deleted.

Makefile

+62-14
Original file line numberDiff line numberDiff line change
@@ -18,30 +18,29 @@ REGISTRY ?= nvidia
1818
GO ?= go
1919
MKDIR ?= mkdir
2020
GOLANGCILINT_TIMEOUT ?= 10m
21+
# Optional suffix appended to the image tag. It must default to a truly empty
# value: make does not treat quotes specially, so `""` would be a non-empty
# two-character string and $(if $(IMAGE_TAG),...) would always be true.
IMAGE_TAG ?=
2122

2223
DCGM_VERSION := $(NEW_DCGM_VERSION)
23-
GOLANG_VERSION := 1.22.5
24+
GOLANG_VERSION := 1.22.9
2425
VERSION := $(NEW_EXPORTER_VERSION)
2526
FULL_VERSION := $(DCGM_VERSION)-$(VERSION)
2627
OUTPUT := type=oci,dest=/dev/null
2728
PLATFORMS := linux/amd64,linux/arm64
28-
DOCKERCMD := docker buildx build
29+
DOCKERCMD := docker --debug buildx build
2930
MODULE := github.com/NVIDIA/dcgm-exporter
3031

31-
3232
.PHONY: all binary install check-format local
3333
all: update-version ubuntu22.04 ubi9
3434

35-
# Build the dcgm-exporter binary inside cmd/dcgm-exporter, stamping
# main.BuildVersion with "<DCGM_VERSION>-<VERSION>".
# `&&` (rather than `;`) stops the recipe if the directory change fails, so
# `go build` can never run from the wrong directory.
binary: update-version
	cd cmd/dcgm-exporter && $(GO) build -ldflags "-X main.BuildVersion=${DCGM_VERSION}-${VERSION}"
3737

38-
test-main:
38+
test-main: generate
3939
$(GO) test ./... -short
4040

4141
install: binary
4242
install -m 755 cmd/dcgm-exporter/dcgm-exporter /usr/bin/dcgm-exporter
4343
install -m 644 -D ./etc/default-counters.csv /etc/dcgm-exporter/default-counters.csv
44-
install -m 644 -D ./etc/dcp-metrics-included.csv /etc/dcgm-exporter/dcp-metrics-included.csv
4544

4645
check-format:
4746
test $$(gofmt -l pkg | tee /dev/stderr | wc -l) -eq 0
@@ -58,23 +57,71 @@ else
5857
$(MAKE) PLATFORMS=linux/amd64 OUTPUT=type=docker DOCKERCMD='docker build'
5958
endif
6059

61-
TARGETS = ubuntu22.04 ubi9
60+
# Container image targets.
#
# Each concrete target (ubi9, ubuntu22.04) sets its base image and tag suffix
# as target-specific variables; the matching pattern rule selects the
# Dockerfile and delegates the build to the --docker-build-% rule, whose
# recipe reads DOCKERFILE, BASE_IMAGE and IMAGE_TAG.
#
# NOTE: do not list these targets in .PHONY. GNU make skips implicit (pattern)
# rule search for phony targets, so the ubi%/ubuntu% rules would stop applying.

# --- UBI -------------------------------------------------------------------
ubi9: BASE_IMAGE := nvcr.io/nvidia/cuda:12.6.3-base-ubi9
ubi9: IMAGE_TAG := ubi9

ubi%: DOCKERFILE := docker/Dockerfile.ubi
ubi%: --docker-build-%
	@

# --- Ubuntu ----------------------------------------------------------------
ubuntu22.04: BASE_IMAGE := nvcr.io/nvidia/cuda:12.6.3-base-ubuntu22.04
ubuntu22.04: IMAGE_TAG := ubuntu22.04

ubuntu%: DOCKERFILE := docker/Dockerfile.ubuntu
ubuntu%: --docker-build-%
	@
6271

63-
DOCKERFILE.ubuntu22.04 = docker/Dockerfile.ubuntu22.04
64-
DOCKERFILE.ubi9 = docker/Dockerfile.ubi9
6572

66-
$(TARGETS):
73+
--docker-build-%:
74+
@echo "Building for $@"
75+
DOCKER_BUILDKIT=1 \
6776
$(DOCKERCMD) --pull \
6877
--output $(OUTPUT) \
78+
--progress=plain \
6979
--platform $(PLATFORMS) \
80+
--build-arg BASEIMAGE="$(BASE_IMAGE)" \
7081
--build-arg "GOLANG_VERSION=$(GOLANG_VERSION)" \
7182
--build-arg "DCGM_VERSION=$(DCGM_VERSION)" \
7283
--build-arg "VERSION=$(VERSION)" \
73-
--tag "$(REGISTRY)/dcgm-exporter:$(FULL_VERSION)-$@" \
74-
--file $(DOCKERFILE.$@) .
84+
--tag $(REGISTRY)/dcgm-exporter:$(FULL_VERSION)$(if $(IMAGE_TAG),-$(IMAGE_TAG)) \
85+
--file $(DOCKERFILE) .
86+
87+
# Packaging entry points.
#   packages       build the tarballs for both architectures
#   package-arm64  build the linux/arm64 tarball
#   package-amd64  build the linux/amd64 tarball
# Each per-architecture target re-invokes make so that PLATFORMS is set to a
# single platform for the package-build recipe. package-build is declared
# phony too, since it names a command rather than a file.
.PHONY: packages package-arm64 package-amd64 package-build
packages: package-amd64 package-arm64

package-arm64:
	$(MAKE) package-build PLATFORMS=linux/arm64

package-amd64:
	$(MAKE) package-build PLATFORMS=linux/amd64
95+
96+
# Build a release tarball for the platform given in PLATFORMS.
#
# Steps:
#   1. Take the second '/'-separated field of PLATFORMS as the architecture
#      and map it to the distribution name (amd64 -> x86-64, arm64 -> sbsa;
#      any other value is used unchanged). This presumably expects a single
#      platform such as linux/amd64 -- the package-* targets pass exactly one.
#   2. Build the ubuntu22.04 image into the local docker image store
#      (OUTPUT=type=docker).
#   3. Create a container from that image and copy /usr/bin/dcgm-exporter and
#      /etc/dcgm-exporter out of it into a staging tree under /tmp.
#   4. Add LICENSE and the systemd unit file, then remove the container.
#   5. Archive the staging tree into dist/<DIST_NAME>.tar.gz and delete it.
#
# The whole recipe is one shell invocation (backslash continuations) so the
# shell variables ARCH, DIST_NAME, COMPONENT_NAME and I persist across steps.
# NOTE(review): if a step after `docker create` fails, the `&&` chain stops
# and the container and staging directory are left behind.
package-build: IMAGE_TAG = ubuntu22.04
package-build:
	ARCH=$$(echo $(PLATFORMS) | cut -d'/' -f2); \
	if [ "$$ARCH" = "amd64" ]; then \
		ARCH="x86-64"; \
	fi; \
	if [ "$$ARCH" = "arm64" ]; then \
		ARCH="sbsa"; \
	fi; \
	export DIST_NAME="dcgm_exporter-linux-$$ARCH-$(VERSION)"; \
	export COMPONENT_NAME="dcgm_exporter"; \
	$(MAKE) ubuntu22.04 OUTPUT=type=docker PLATFORMS=$(PLATFORMS) && \
	$(MKDIR) -p /tmp/$$DIST_NAME/$$COMPONENT_NAME && \
	$(MKDIR) -p /tmp/$$DIST_NAME/$$COMPONENT_NAME/usr/bin && \
	$(MKDIR) -p /tmp/$$DIST_NAME/$$COMPONENT_NAME/etc/dcgm-exporter && \
	I=$$(docker create $(REGISTRY)/dcgm-exporter:$(FULL_VERSION)-$(IMAGE_TAG)) && \
	docker cp $$I:/usr/bin/dcgm-exporter /tmp/$$DIST_NAME/$$COMPONENT_NAME/usr/bin/ && \
	docker cp $$I:/etc/dcgm-exporter /tmp/$$DIST_NAME/$$COMPONENT_NAME/etc/ && \
	cp ./LICENSE /tmp/$$DIST_NAME/$$COMPONENT_NAME && \
	$(MKDIR) -p /tmp/$$DIST_NAME/$$COMPONENT_NAME/lib/systemd/system/ && \
	cp ./packaging/config-files/systemd/nvidia-dcgm-exporter.service \
		/tmp/$$DIST_NAME/$$COMPONENT_NAME/lib/systemd/system/nvidia-dcgm-exporter.service && \
	docker rm -f $$I && \
	$(MKDIR) -p $(CURDIR)/dist && \
	cd "/tmp/$$DIST_NAME" && tar -czf $(CURDIR)/dist/$$DIST_NAME.tar.gz $$(ls -A) && \
	rm -rf "/tmp/$$DIST_NAME"
75122

76123
# Run the integration tests under ./tests/integration/ with the race detector
# enabled, caching disabled (-count=1) and a 5 minute timeout. Extra flags can
# be passed through TEST_ARGS.
# The .PHONY list keeps the historical name `integration` and adds the actual
# target name, so the rule still runs if a file called test-integration exists.
.PHONY: integration test-integration
test-integration: generate
	$(GO) test -race -count=1 -timeout 5m -v $(TEST_ARGS) ./tests/integration/
79126

80127
test-coverage:
@@ -83,7 +130,7 @@ test-coverage:
83130

84131
.PHONY: lint
85132
lint:
86-
golangci-lint run ./... --timeout $(GOLANGCILINT_TIMEOUT) --new-from-rev=HEAD~1 --fix
133+
golangci-lint run ./... --timeout $(GOLANGCILINT_TIMEOUT) --new-from-rev=HEAD~1
87134

88135
.PHONY: validate-modules
89136
validate-modules:
@@ -99,6 +146,7 @@ tools: ## Install required tools and utilities
99146
go install github.com/axw/gocov/gocov@latest
100147
go install golang.org/x/tools/cmd/goimports@latest
101148
go install mvdan.cc/gofumpt@latest
149+
go install github.com/wadey/gocovmerge@latest
102150

103151
fmt:
104152
find . -name '*.go' | xargs gofumpt -l -w

README.md

+3-2
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ Official documentation for DCGM-Exporter can be found on [docs.nvidia.com](https
1111
To gather metrics on a GPU node, simply start the `dcgm-exporter` container:
1212

1313
```shell
14-
docker run -d --gpus all --cap-add SYS_ADMIN --rm -p 9400:9400 nvcr.io/nvidia/k8s/dcgm-exporter:3.3.9-3.6.1-ubuntu22.04
14+
docker run -d --gpus all --cap-add SYS_ADMIN --rm -p 9400:9400 nvcr.io/nvidia/k8s/dcgm-exporter:4.0.0-4.0.0-ubuntu22.04
1515
curl localhost:9400/metrics
1616
# HELP DCGM_FI_DEV_SM_CLOCK SM clock frequency (in MHz).
1717
# TYPE DCGM_FI_DEV_SM_CLOCK gauge
@@ -111,8 +111,9 @@ To enable GPU-to-job mapping on the DCGM-exporter side, users must run the DCGM-
111111

112112
In order to build dcgm-exporter ensure you have the following:
113113

114-
* [Golang >= 1.21 installed](https://golang.org/)
114+
* [Golang >= 1.22 installed](https://golang.org/)
115115
* [DCGM installed](https://developer.nvidia.com/dcgm)
116+
* A Linux machine with a GPU that is compatible with DCGM.
116117

117118
```shell
118119
git clone https://github.com/NVIDIA/dcgm-exporter.git

0 commit comments

Comments
 (0)