Skip to content

Commit 691c927

Browse files
authored
DCGM-Exporter 4.5.3-4.8.2 (#656)
1 parent 2623f7a commit 691c927

46 files changed

Lines changed: 1283 additions & 874 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.devcontainer/Dockerfile

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
FROM nvcr.io/nvidia/cuda:13.1.1-base-ubuntu22.04
2-
ARG GOLANG_VERSION=1.24.13
1+
FROM nvcr.io/nvidia/cuda:13.2.1-base-ubuntu22.04
2+
ARG GOLANG_VERSION=1.26.2
33
ARG USERNAME=developer
44
ARG USER_UID=1000
55
ARG USER_GID=1000
@@ -83,12 +83,21 @@ RUN set -eux; \
8383
\
8484
tar -C /usr/local -xzf go.tgz; \
8585
rm go.tgz
86-
ENV GOTOOLCHAIN=local
86+
# GOTOOLCHAIN=auto lets Go honour `toolchain` directives in go.mod, auto-
87+
# fetching the matching version if the baked-in compiler is older. Costs
88+
# one toolchain download per fresh build cache, then nothing.
89+
ENV GOTOOLCHAIN=auto
8790
ENV GOPATH=/go
8891
ENV PATH=$GOPATH/bin:$PATH
8992
RUN mkdir -p "$GOPATH/src" "$GOPATH/bin" && chmod -R 1777 "$GOPATH"
9093
ENV PATH=$PATH:/usr/local/go/bin
9194

95+
ARG UV_VERSION=0.11.7
96+
RUN curl -LsSf "https://astral.sh/uv/${UV_VERSION}/install.sh" | sh \
97+
&& mv /root/.local/bin/uv /usr/local/bin/uv \
98+
&& mv /root/.local/bin/uvx /usr/local/bin/uvx \
99+
&& uv --version
100+
92101
# Required for DCGM metrics
93102
ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility,compat32
94103
# disable all constraints on the configurations required by NVIDIA container toolkit

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
*.swp
22
*.swo
33
dcgm-exporter
4+
.go/
5+
.cursor/
46
!etc/
57
!deployment/
68
.env
@@ -9,6 +11,7 @@ dcgm-exporter
911
vendor/
1012
tests.cov
1113
test_results.json
14+
.coverdata/
1215
.scannerwork
1316
dist/
1417
.run

.hadolint.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
# Ignored rules with justification:
55
# - DL3008/DL3041: Package version pinning not used because:
66
# * We intentionally use the latest DCGM version available in NVIDIA repos
7-
# * Version control is provided by the versioned CUDA base image (e.g., cuda:13.1.1)
7+
# * Version control is provided by the versioned CUDA base image (e.g., cuda:13.2.1)
88
# * Allows automatic security patches and bug fixes within compatible versions
99
# * Pinning would require Dockerfile updates for every DCGM patch release
1010
# * Build tools (wget, gcc) are ephemeral and don't affect final image

Makefile

Lines changed: 70 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -16,12 +16,15 @@ include hack/VERSION
1616

1717
REGISTRY ?= nvidia
1818
GO ?= go
19+
GOBIN_DIR := $(or $(shell $(GO) env GOBIN),$(shell $(GO) env GOPATH)/bin)
1920
MKDIR ?= mkdir
2021
GOLANGCILINT_TIMEOUT ?= 10m
2122
IMAGE_TAG ?= ""
2223

24+
export PATH := $(GOBIN_DIR):$(PATH)
25+
2326
DCGM_VERSION := $(NEW_DCGM_VERSION)
24-
GOLANG_VERSION := 1.24.13
27+
GOLANG_VERSION := 1.26.2
2528
VERSION := $(NEW_EXPORTER_VERSION)
2629
FULL_VERSION := $(DCGM_VERSION)-$(VERSION)
2730
OUTPUT := type=oci,dest=/dev/null
@@ -63,14 +66,14 @@ ubi%: DOCKERFILE = docker/Dockerfile
6366
ubi%: BUILD_TARGET = runtime-ubi
6467
ubi%: --docker-build-%
6568
@
66-
ubi9: BASE_IMAGE = nvcr.io/nvidia/cuda:13.1.1-base-ubi9
69+
ubi9: BASE_IMAGE = nvcr.io/nvidia/cuda:13.2.1-base-ubi9
6770
ubi9: IMAGE_TAG = ubi9
6871

6972
ubuntu%: DOCKERFILE = docker/Dockerfile
7073
ubuntu%: BUILD_TARGET = runtime-ubuntu
7174
ubuntu%: --docker-build-%
7275
@
73-
ubuntu22.04: BASE_IMAGE = nvcr.io/nvidia/cuda:13.1.1-base-ubuntu22.04
76+
ubuntu22.04: BASE_IMAGE = nvcr.io/nvidia/cuda:13.2.1-base-ubuntu22.04
7477
ubuntu22.04: IMAGE_TAG = ubuntu22.04
7578

7679
distroless: DOCKERFILE = docker/Dockerfile
@@ -80,6 +83,7 @@ distroless: --docker-build-distroless
8083

8184
--docker-build-%:
8285
@echo "Building for $@ with target $(BUILD_TARGET)"
86+
mkdir -p .go/compiler .go/pkg/mod
8387
docker buildx inspect
8488
DOCKER_BUILDKIT=1 \
8589
$(DOCKERCMD) --pull \
@@ -92,6 +96,9 @@ distroless: --docker-build-distroless
9296
--build-arg "GOLANG_VERSION=$(GOLANG_VERSION)" \
9397
--build-arg "DCGM_VERSION=$(DCGM_VERSION)" \
9498
--build-arg "VERSION=$(VERSION)" \
99+
$(if $(GOPROXY),--build-arg "GOPROXY=$(GOPROXY)") \
100+
$(if $(GONOSUMDB),--build-arg "GONOSUMDB=$(GONOSUMDB)") \
101+
$(if $(GOSUMDB),--build-arg "GOSUMDB=$(GOSUMDB)") \
95102
--tag $(REGISTRY)/dcgm-exporter:$(FULL_VERSION)$(if $(IMAGE_TAG),-$(IMAGE_TAG)) \
96103
--file $(DOCKERFILE) .
97104

@@ -104,18 +111,27 @@ package-arm64:
104111
package-amd64:
105112
$(MAKE) package-build PLATFORMS=linux/amd64
106113

114+
ifeq ($(GOPROXY_ENABLED),true)
115+
package-build: BUILD_TYPE = distroless
116+
package-build: IMAGE_TAG = distroless
117+
DIST_PREFIX = stig-
118+
else
119+
package-build: BUILD_TYPE = ubuntu22.04
107120
package-build: IMAGE_TAG = ubuntu22.04
121+
DIST_PREFIX =
122+
endif
123+
108124
package-build:
109-
ARCH=`echo $(PLATFORMS) | cut -d'/' -f2)`; \
125+
ARCH=`echo $(PLATFORMS) | cut -d'/' -f2`; \
110126
if [ "$$ARCH" = "amd64" ]; then \
111127
ARCH="x86-64"; \
112128
fi; \
113129
if [ "$$ARCH" = "arm64" ]; then \
114130
ARCH="sbsa"; \
115131
fi; \
116-
export DIST_NAME="dcgm_exporter-linux-$$ARCH-$(VERSION)"; \
132+
export DIST_NAME="dcgm_exporter-$(DIST_PREFIX)linux-$$ARCH-$(VERSION)"; \
117133
export COMPONENT_NAME="dcgm_exporter"; \
118-
$(MAKE) ubuntu22.04 OUTPUT=type=docker PLATFORMS=$(PLATFORMS) && \
134+
$(MAKE) $(BUILD_TYPE) OUTPUT=type=docker PLATFORMS=$(PLATFORMS) && \
119135
$(MKDIR) -p /tmp/$$DIST_NAME/$$COMPONENT_NAME && \
120136
$(MKDIR) -p /tmp/$$DIST_NAME/$$COMPONENT_NAME/usr/bin && \
121137
$(MKDIR) -p /tmp/$$DIST_NAME/$$COMPONENT_NAME/etc/dcgm-exporter && \
@@ -135,26 +151,58 @@ package-build:
135151
test-integration: generate
136152
go test -race -count=1 -timeout 5m -v $(TEST_ARGS) ./tests/integration/
137153

154+
.PHONY: test-coverage
138155
test-coverage:
156+
@echo "Preparing coverage data directories..."
157+
@rm -rf .coverdata
158+
@mkdir -p .coverdata/unit .coverdata/integration .coverdata/merged
139159
@echo "Running unit tests..."
140160
gotestsum --format testname -- \
141-
$$(go list ./... | grep -v "/tests/e2e/") \
161+
$$($(GO) list ./... | grep -v "/tests/e2e/") \
142162
-count=1 -timeout 5m \
143-
-covermode=count \
144-
-coverprofile=unit_coverage.out \
145-
--short
163+
-cover -covermode=count \
164+
--short \
165+
-args -test.gocoverdir=$(CURDIR)/.coverdata/unit
146166
@echo "Running integration tests..."
147167
gotestsum --format testname -- \
148168
./internal/pkg/integration_test/... \
149169
-count=1 -timeout 5m \
150-
-covermode=count \
170+
-cover -covermode=count \
151171
-coverpkg=./internal/pkg/... \
152-
-coverprofile=integration_coverage.out \
172+
--short \
173+
-args -test.gocoverdir=$(CURDIR)/.coverdata/integration
174+
@echo "Merging coverage data..."
175+
$(GO) tool covdata merge \
176+
-i=$(CURDIR)/.coverdata/unit,$(CURDIR)/.coverdata/integration \
177+
-o=$(CURDIR)/.coverdata/merged
178+
@echo "Coverage summary (pre-filter):"
179+
$(GO) tool covdata percent -i=$(CURDIR)/.coverdata/merged
180+
$(GO) tool covdata textfmt \
181+
-i=$(CURDIR)/.coverdata/merged \
182+
-o=combined_coverage.out.tmp
183+
grep -v "mock_" combined_coverage.out.tmp > tests.cov
184+
rm -rf combined_coverage.out.tmp .coverdata
185+
$(GO) tool cover -func=tests.cov
186+
187+
# Unit tests only with coverage (for CI without GPU/DCGM)
188+
# Skips integration tests that require DCGM library
189+
# Skips nvmlprovider tests that require NVML library (GPU)
190+
# Emits a single coverage profile directly (no merge step)
191+
# Generates test_results.json for SonarQube integration
192+
.PHONY: unit-test-coverage
193+
unit-test-coverage:
194+
@echo "Running unit tests only (skipping integration tests and nvmlprovider)..."
195+
gotestsum --format testname --jsonfile test_results.json -- \
196+
$$(go list ./... | grep -v -E "(tests/e2e|integration_test|nvmlprovider)") \
197+
-count=1 -timeout 5m \
198+
-covermode=count \
199+
-coverprofile=tests.cov \
153200
--short
154-
@echo "Merging coverage profiles..."
155-
gocovmerge unit_coverage.out integration_coverage.out > combined_coverage.out.tmp
156-
cat combined_coverage.out.tmp | grep -v "mock_" > tests.cov
157-
rm combined_coverage.out.tmp integration_coverage.out unit_coverage.out
201+
@echo "Filtering out mock files from coverage..."
202+
@if [ -f tests.cov ]; then \
203+
grep -v "mock_" tests.cov > tests.cov.tmp && mv tests.cov.tmp tests.cov || true; \
204+
fi
205+
@echo "Unit test coverage completed"
158206
go tool cover -func=tests.cov
159207

160208
.PHONY: lint
@@ -194,22 +242,21 @@ validate: validate-modules hadolint check-fmt ## Run all validation checks
194242

195243
.PHONY: tools
196244
tools: ## Install required tools and utilities
197-
curl -sSfL https://golangci-lint.run/install.sh | sh -s -- -b $(shell go env GOPATH)/bin v2.8.0
198-
go install golang.org/x/tools/cmd/goimports@v0.41.0
199-
go install mvdan.cc/gofumpt@v0.9.2
200-
go install github.com/wadey/gocovmerge@v0.0.0-20160331181800-b5bfa59ec0ad
201-
go install gotest.tools/gotestsum@v1.13.0
245+
curl -sSfL https://golangci-lint.run/install.sh | sh -s -- -b $(GOBIN_DIR) v2.11.4
246+
$(GO) install golang.org/x/tools/cmd/goimports@v0.44.0
247+
$(GO) install mvdan.cc/gofumpt@v0.9.2
248+
$(GO) install gotest.tools/gotestsum@v1.13.0
202249

203250
fmt:
204-
find . -name '*.go' | xargs gofumpt -l -w
251+
find . -path './.go' -prune -o -name '*.go' -print | xargs gofumpt -l -w
205252

206253
goimports:
207254
go list -f {{.Dir}} $(MODULE)/... \
208255
| xargs goimports -local $(MODULE) -w
209256

210257
check-fmt:
211258
@echo "Checking code formatting. Any listed files don't match goimports:"
212-
! (find . -iname "*.go" \
259+
! (find . -path './.go' -prune -o -path './internal/mocks' -prune -o -path './third_party' -prune -o -path './examples' -prune -o -iname "*.go" -print \
213260
| xargs goimports -l -local $(MODULE) | grep .)
214261

215262
.PHONY: e2e-test

README.md

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ Official documentation for DCGM-Exporter can be found on [docs.nvidia.com](https
1111
To gather metrics on a GPU node, simply start the `dcgm-exporter` container:
1212

1313
```shell
14-
docker run -d --gpus all --cap-add SYS_ADMIN --rm -p 9400:9400 nvcr.io/nvidia/k8s/dcgm-exporter:4.5.2-4.8.1-distroless
14+
docker run -d --gpus all --cap-add SYS_ADMIN --rm -p 9400:9400 nvcr.io/nvidia/k8s/dcgm-exporter:4.5.3-4.8.2-distroless
1515
curl localhost:9400/metrics
1616
# HELP DCGM_FI_DEV_SM_CLOCK SM clock frequency (in MHz).
1717
# TYPE DCGM_FI_DEV_SM_CLOCK gauge
@@ -92,6 +92,35 @@ dcgm-exporter --web-config-file=web-config.yaml
9292

9393
A sample `web-config.yaml` file can be fetched from [exporter-toolkit repository](https://github.com/prometheus/exporter-toolkit/blob/master/docs/web-config.yml). The reference of the `web-config.yaml` file can be consulted in the [docs](https://github.com/prometheus/exporter-toolkit/blob/master/docs/web-configuration.md).
9494

95+
### IPv6 Support
96+
97+
DCGM-Exporter supports IPv6 addresses for both the remote hostengine connection (`-r`) and the metrics listen address (`-a`). IPv6 addresses must use bracket notation when combined with a port.
98+
99+
#### Remote Hostengine (CLI)
100+
101+
```shell
102+
dcgm-exporter -r "[::1]:5555"
103+
```
104+
105+
#### Remote Hostengine (Environment Variable)
106+
107+
```shell
108+
export DCGM_REMOTE_HOSTENGINE_INFO="[::1]:5555"
109+
dcgm-exporter
110+
```
111+
112+
#### Metrics Listen Address
113+
114+
```shell
115+
dcgm-exporter -a "[::]:9400"
116+
```
117+
118+
**Note:** The brackets in `[::1]:5555` are required by the DCGM connection protocol. On the command line, wrap the address in single or double quotes so that the shell does not interpret the brackets as a glob pattern.
119+
120+
#### Prerequisites
121+
122+
The remote `nv-hostengine` must be configured to listen on IPv6. Refer to the [DCGM documentation](https://docs.nvidia.com/datacenter/dcgm/latest/) for configuring `nv-hostengine` bind address options.
123+
95124
### How to include HPC jobs in metric labels
96125

97126
The DCGM-exporter can include High-Performance Computing (HPC) job information into its metric labels. To achieve this, HPC environment administrators must configure their HPC environment to generate files that map GPUs to HPC jobs.
@@ -164,6 +193,10 @@ Notes:
164193
* Always make sure your entries have 2 commas (',')
165194
* The complete list of counters that can be collected can be found on the DCGM API reference manual: <https://docs.nvidia.com/datacenter/dcgm/latest/dcgm-api/dcgm-api-field-ids.html>
166195

196+
### Profiling Metrics
197+
198+
Note that on Ampere and earlier GPU generations, profiling metrics depend on the `datacenter-gpu-manager-4-proprietary` package. This package is included in the container.
199+
167200
### What about a Grafana Dashboard?
168201

169202
You can find the official NVIDIA DCGM-Exporter dashboard here: <https://grafana.com/grafana/dashboards/12239>

dcgm-exporter.yaml

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -18,24 +18,24 @@ metadata:
1818
name: "dcgm-exporter"
1919
labels:
2020
app.kubernetes.io/name: "dcgm-exporter"
21-
app.kubernetes.io/version: "4.8.1"
21+
app.kubernetes.io/version: "4.8.2"
2222
spec:
2323
updateStrategy:
2424
type: RollingUpdate
2525
selector:
2626
matchLabels:
2727
app.kubernetes.io/name: "dcgm-exporter"
28-
app.kubernetes.io/version: "4.8.1"
28+
app.kubernetes.io/version: "4.8.2"
2929
template:
3030
metadata:
3131
labels:
3232
app.kubernetes.io/name: "dcgm-exporter"
33-
app.kubernetes.io/version: "4.8.1"
33+
app.kubernetes.io/version: "4.8.2"
3434
name: "dcgm-exporter"
3535
spec:
3636
automountServiceAccountToken: false
3737
containers:
38-
- image: "nvcr.io/nvidia/k8s/dcgm-exporter:4.5.2-4.8.1-distroless"
38+
- image: "nvcr.io/nvidia/k8s/dcgm-exporter:4.5.3-4.8.2-distroless"
3939
env:
4040
- name: "DCGM_EXPORTER_LISTEN"
4141
value: ":9400"
@@ -83,11 +83,11 @@ metadata:
8383
name: "dcgm-exporter"
8484
labels:
8585
app.kubernetes.io/name: "dcgm-exporter"
86-
app.kubernetes.io/version: "4.8.1"
86+
app.kubernetes.io/version: "4.8.2"
8787
spec:
8888
selector:
8989
app.kubernetes.io/name: "dcgm-exporter"
90-
app.kubernetes.io/version: "4.8.1"
90+
app.kubernetes.io/version: "4.8.2"
9191
ports:
9292
- name: "metrics"
9393
port: 9400

deployment/Chart.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
apiVersion: v2
22
name: dcgm-exporter
33
description: A Helm chart for DCGM exporter
4-
version: "4.8.1"
4+
version: "4.8.2"
55
kubeVersion: ">= 1.19.0-0"
6-
appVersion: "4.8.1"
6+
appVersion: "4.8.2"
77
sources:
88
- https://github.com/nvidia/dcgm-exporter
99
home: https://github.com/nvidia/dcgm-exporter/

deployment/templates/metrics-configmap.yaml

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,6 @@ data:
5050
# Memory usage
5151
DCGM_FI_DEV_FB_FREE, gauge, Framebuffer memory free (in MiB).
5252
DCGM_FI_DEV_FB_USED, gauge, Framebuffer memory used (in MiB).
53-
DCGM_FI_DEV_FB_RESERVED, gauge, Framebuffer memory reserved (in MiB).
5453
5554
# ECC
5655
# DCGM_FI_DEV_ECC_SBE_VOL_TOTAL, counter, Total number of single-bit volatile ECC errors.
@@ -79,9 +78,6 @@ data:
7978
DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS, counter, Number of remapped rows for correctable errors
8079
DCGM_FI_DEV_ROW_REMAP_FAILURE, gauge, Whether remapping of rows has failed
8180
82-
# Static configuration information. These appear as labels on the other metrics
83-
DCGM_FI_DRIVER_VERSION, label, Driver Version
84-
8581
# DCP metrics
8682
DCGM_FI_PROF_GR_ENGINE_ACTIVE, gauge, Ratio of time the graphics engine is active.
8783
# DCGM_FI_PROF_SM_ACTIVE, gauge, The ratio of cycles an SM has at least 1 warp assigned.

deployment/templates/service-monitor.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ spec:
3737
scrapeTimeout: "{{ .Values.serviceMonitor.scrapeTimeout }}"
3838
honorLabels: {{ .Values.serviceMonitor.honorLabels }}
3939
relabelings:
40-
{{ toYaml .Values.serviceMonitor.relabelings | nindent 6 }}
40+
{{- toYaml .Values.serviceMonitor.relabelings | nindent 6 }}
4141
metricRelabelings:
42-
{{ toYaml .Values.serviceMonitor.metricRelabelings | nindent 6 }}
42+
{{- toYaml .Values.serviceMonitor.metricRelabelings | nindent 6 }}
4343
{{- end -}}

0 commit comments

Comments
 (0)