Skip to content

Commit dd3001a

Browse files
DCGM Exporter Release 3.3.6-3.4.2 (#325)
Signed-off-by: Rohit Arora <[email protected]> Co-authored-by: Vadym Fedorov <[email protected]>
1 parent 7decfd2 commit dd3001a

37 files changed

+1337
-201
lines changed

.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ vendor/
1010
tests.cov
1111
test_results.json
1212
.scannerwork
13+
dist/
1314

1415
###############################################################################
1516
# JetBrains

Makefile

+23-20
Original file line numberDiff line numberDiff line change
@@ -14,30 +14,32 @@
1414

1515
include hack/VERSION
1616

17-
MKDIR ?= mkdir
18-
REGISTRY ?= nvidia
17+
REGISTRY ?= nvidia
18+
GO ?= go
19+
MKDIR ?= mkdir
1920
GOLANGCILINT_TIMEOUT ?= 10m
2021

2122
DCGM_VERSION := $(NEW_DCGM_VERSION)
2223
GOLANG_VERSION := 1.21.5
2324
VERSION := $(NEW_EXPORTER_VERSION)
2425
FULL_VERSION := $(DCGM_VERSION)-$(VERSION)
25-
OUTPUT := type=oci,dest=/tmp/dcgm-exporter.tar
26+
OUTPUT := type=oci,dest=/dev/null
2627
PLATFORMS := linux/amd64,linux/arm64
2728
DOCKERCMD := docker buildx build
2829
MODULE := github.com/NVIDIA/dcgm-exporter
2930

31+
3032
.PHONY: all binary install check-format local
3133
all: update-version ubuntu22.04 ubi9
3234

33-
binary: update-version
34-
cd cmd/dcgm-exporter; go build -ldflags "-X main.BuildVersion=${DCGM_VERSION}-${VERSION}"
35+
binary: generate update-version
36+
# Chain with && so a failed cd aborts the recipe instead of running the build in the wrong directory.
cd cmd/dcgm-exporter && $(GO) build -ldflags "-X main.BuildVersion=${DCGM_VERSION}-${VERSION}"
3537

3638
test-main:
37-
go test ./... -short
39+
$(GO) test ./... -short
3840

3941
install: binary
40-
install -m 755 cmd/dcgm-exporter/dcgm-exporter /usr/bin/dcgm-exporter
42+
# When DIST_DIR is unset, fall back to cmd/dcgm-exporter, the directory in which the binary target runs the Go build.
install -m 755 $(or $(DIST_DIR),cmd/dcgm-exporter)/dcgm-exporter /usr/bin/dcgm-exporter
4143
install -m 644 -D ./etc/default-counters.csv /etc/dcgm-exporter/default-counters.csv
4244
install -m 644 -D ./etc/dcp-metrics-included.csv /etc/dcgm-exporter/dcp-metrics-included.csv
4345

@@ -56,24 +58,20 @@ else
5658
$(MAKE) PLATFORMS=linux/amd64 OUTPUT=type=docker DOCKERCMD='docker build'
5759
endif
5860

59-
ubuntu22.04:
60-
$(DOCKERCMD) --pull \
61-
--output $(OUTPUT) \
62-
--platform $(PLATFORMS) \
63-
--build-arg "GOLANG_VERSION=$(GOLANG_VERSION)" \
64-
--build-arg "DCGM_VERSION=$(DCGM_VERSION)" \
65-
--tag "$(REGISTRY)/dcgm-exporter:$(FULL_VERSION)-ubuntu22.04" \
66-
--file docker/Dockerfile.ubuntu22.04 .
61+
TARGETS = ubuntu22.04 ubi9
6762

68-
ubi9:
63+
DOCKERFILE.ubuntu22.04 = docker/Dockerfile.ubuntu22.04
64+
DOCKERFILE.ubi9 = docker/Dockerfile.ubi9
65+
66+
$(TARGETS):
6967
$(DOCKERCMD) --pull \
7068
--output $(OUTPUT) \
7169
--platform $(PLATFORMS) \
7270
--build-arg "GOLANG_VERSION=$(GOLANG_VERSION)" \
7371
--build-arg "DCGM_VERSION=$(DCGM_VERSION)" \
74-
--build-arg "VERSION=$(FULL_VERSION)" \
75-
--tag "$(REGISTRY)/dcgm-exporter:$(FULL_VERSION)-ubi9" \
76-
--file docker/Dockerfile.ubi9 .
72+
--build-arg "VERSION=$(VERSION)" \
73+
--tag "$(REGISTRY)/dcgm-exporter:$(FULL_VERSION)-$@" \
74+
--file $(DOCKERFILE.$@) .
7775

7876
# Declare the target that is actually defined below (test-integration) as phony; `integration` is kept for compatibility.
.PHONY: integration test-integration
7977
test-integration:
@@ -84,7 +82,7 @@ test-coverage:
8482

8583
.PHONY: lint
8684
lint:
87-
golangci-lint run ./... --timeout $(GOLANGCILINT_TIMEOUT) --new-from-rev=HEAD~1 --verbose
85+
golangci-lint run ./... --timeout $(GOLANGCILINT_TIMEOUT) --new-from-rev=HEAD~1 --fix
8886

8987
.PHONY: validate-modules
9088
validate-modules:
@@ -132,3 +130,8 @@ update-version:
132130

133131
# Update DCGM and DCGM Exporter versions
134132
update-versions: update-version
133+
134+
.PHONY: generate
135+
# Generate code (Mocks)
136+
generate:
137+
# Invoke the toolchain through $(GO) so a command-line override (make GO=...) also applies to code generation.
$(GO) generate ./...

README.md

+61-33
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,10 @@ Official documentation for DCGM-Exporter can be found on [docs.nvidia.com](https
99
### Quickstart
1010

1111
To gather metrics on a GPU node, simply start the `dcgm-exporter` container:
12-
```
13-
$ docker run -d --gpus all --rm -p 9400:9400 nvcr.io/nvidia/k8s/dcgm-exporter:3.3.5-3.4.1-ubuntu22.04
14-
$ curl localhost:9400/metrics
12+
13+
```shell
14+
docker run -d --gpus all --rm -p 9400:9400 nvcr.io/nvidia/k8s/dcgm-exporter:3.3.6-3.4.2-ubuntu22.04
15+
curl localhost:9400/metrics
1516
# HELP DCGM_FI_DEV_SM_CLOCK SM clock frequency (in MHz).
1617
# TYPE DCGM_FI_DEV_SM_CLOCK gauge
1718
# HELP DCGM_FI_DEV_MEM_CLOCK Memory clock frequency (in MHz).
@@ -32,33 +33,38 @@ Note: Consider using the [NVIDIA GPU Operator](https://github.com/NVIDIA/gpu-ope
3233
Ensure you have already setup your cluster with the [default runtime as NVIDIA](https://github.com/NVIDIA/nvidia-container-runtime#docker-engine-setup).
3334

3435
The recommended way to install DCGM-Exporter is to use the Helm chart:
35-
```
36-
$ helm repo add gpu-helm-charts \
36+
37+
```shell
38+
helm repo add gpu-helm-charts \
3739
https://nvidia.github.io/dcgm-exporter/helm-charts
3840
```
41+
3942
Update the repo:
43+
44+
```shell
45+
helm repo update
4046
```
41-
$ helm repo update
42-
```
47+
4348
And install the chart:
44-
```
45-
$ helm install \
49+
50+
```shell
51+
helm install \
4652
--generate-name \
4753
gpu-helm-charts/dcgm-exporter
4854
```
4955

5056
Once the `dcgm-exporter` pod is deployed, you can use port forwarding to obtain metrics quickly:
5157

52-
53-
```
54-
$ kubectl create -f https://raw.githubusercontent.com/NVIDIA/dcgm-exporter/master/dcgm-exporter.yaml
58+
```shell
59+
kubectl create -f https://raw.githubusercontent.com/NVIDIA/dcgm-exporter/master/dcgm-exporter.yaml
5560

5661
# Let's get the output of a random pod:
57-
$ NAME=$(kubectl get pods -l "app.kubernetes.io/name=dcgm-exporter" \
62+
NAME=$(kubectl get pods -l "app.kubernetes.io/name=dcgm-exporter" \
5863
-o "jsonpath={ .items[0].metadata.name}")
5964

60-
$ kubectl port-forward $NAME 8080:9400 &
61-
$ curl -sL http://127.0.0.1:8080/metrics
65+
kubectl port-forward $NAME 8080:9400 &
66+
67+
curl -sL http://127.0.0.1:8080/metrics
6268
# HELP DCGM_FI_DEV_SM_CLOCK SM clock frequency (in MHz).
6369
# TYPE DCGM_FI_DEV_SM_CLOCK gauge
6470
# HELP DCGM_FI_DEV_MEM_CLOCK Memory clock frequency (in MHz).
@@ -72,33 +78,50 @@ DCGM_FI_DEV_MEMORY_TEMP{gpu="0", UUID="GPU-604ac76c-d9cf-fef3-62e9-d92044ab6e52"
7278
...
7379

7480
```
81+
7582
To integrate DCGM-Exporter with Prometheus and Grafana, see the full instructions in the [user guide](https://docs.nvidia.com/datacenter/cloud-native/gpu-telemetry/latest/).
7683
`dcgm-exporter` is deployed as part of the GPU Operator. To get started with integrating with Prometheus, check the Operator [user guide](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/getting-started.html#gpu-telemetry).
7784

7885
### TLS and Basic Auth
7986

8087
The exporter supports TLS and basic auth using [exporter-toolkit](https://github.com/prometheus/exporter-toolkit). To use TLS and/or basic auth, users need to use the `--web-config-file` CLI flag as follows:
8188

82-
```
89+
```shell
8390
dcgm-exporter --web-config-file=web-config.yaml
8491
```
8592

8693
A sample `web-config.yaml` file can be fetched from [exporter-toolkit repository](https://github.com/prometheus/exporter-toolkit/blob/master/docs/web-config.yml). The reference of the `web-config.yaml` file can be consulted in the [docs](https://github.com/prometheus/exporter-toolkit/blob/master/docs/web-configuration.md).
8794

95+
### How to include HPC jobs in metric labels
96+
97+
The DCGM-exporter can include High-Performance Computing (HPC) job information in its metric labels. To achieve this, HPC environment administrators must configure their HPC environment to generate files that map GPUs to HPC jobs.
98+
99+
#### File Conventions
100+
101+
These mapping files follow a specific format:
102+
103+
* Each file is named after a unique GPU ID (e.g., 0, 1, 2, etc.).
104+
* Each line in the file contains JOB IDs that run on the corresponding GPU.
105+
106+
#### Enabling HPC Job Mapping on DCGM-Exporter
107+
108+
To enable GPU-to-job mapping on the DCGM-exporter side, users must run the DCGM-exporter with the `--hpc-job-mapping-dir` command-line parameter, pointing to a directory where the HPC cluster creates job mapping files. Alternatively, users can set the environment variable `DCGM_HPC_JOB_MAPPING_DIR` to achieve the same result.
109+
88110
### Building from Source
89111

90112
To build dcgm-exporter, ensure you have the following:
91-
- [Golang >= 1.21 installed](https://golang.org/)
92-
- [DCGM installed](https://developer.nvidia.com/dcgm)
93113

94-
```
95-
$ git clone https://github.com/NVIDIA/dcgm-exporter.git
96-
$ cd dcgm-exporter
97-
$ make binary
98-
$ sudo make install
114+
* [Golang >= 1.21 installed](https://golang.org/)
115+
* [DCGM installed](https://developer.nvidia.com/dcgm)
116+
117+
```shell
118+
git clone https://github.com/NVIDIA/dcgm-exporter.git
119+
cd dcgm-exporter
120+
make binary
121+
sudo make install
99122
...
100-
$ dcgm-exporter &
101-
$ curl localhost:9400/metrics
123+
dcgm-exporter &
124+
curl localhost:9400/metrics
102125
# HELP DCGM_FI_DEV_SM_CLOCK SM clock frequency (in MHz).
103126
# TYPE DCGM_FI_DEV_SM_CLOCK gauge
104127
# HELP DCGM_FI_DEV_MEM_CLOCK Memory clock frequency (in MHz).
@@ -118,6 +141,7 @@ With `dcgm-exporter` you can configure which fields are collected by specifying
118141
You will find the default CSV file under `etc/default-counters.csv` in the repository, which is copied on your system or container to `/etc/dcgm-exporter/default-counters.csv`
119142

120143
The layout and format of this file are as follows:
144+
121145
```
122146
# Format
123147
# If line starts with a '#' it is considered a comment
@@ -129,39 +153,43 @@ DCGM_FI_DEV_MEM_CLOCK, gauge, Memory clock frequency (in MHz).
129153
```
130154

131155
A custom csv file can be specified using the `-f` option or `--collectors` as follows:
132-
```
133-
$ dcgm-exporter -f /tmp/custom-collectors.csv
156+
157+
```shell
158+
dcgm-exporter -f /tmp/custom-collectors.csv
134159
```
135160

136161
Notes:
137-
- Always make sure your entries have 2 commas (',')
138-
- The complete list of counters that can be collected can be found on the DCGM API reference manual: https://docs.nvidia.com/datacenter/dcgm/latest/dcgm-api/dcgm-api-field-ids.html
162+
163+
* Always make sure your entries have 2 commas (',')
164+
* The complete list of counters that can be collected can be found on the DCGM API reference manual: <https://docs.nvidia.com/datacenter/dcgm/latest/dcgm-api/dcgm-api-field-ids.html>
139165

140166
### What about a Grafana Dashboard?
141167

142-
You can find the official NVIDIA DCGM-Exporter dashboard here: https://grafana.com/grafana/dashboards/12239
168+
You can find the official NVIDIA DCGM-Exporter dashboard here: <https://grafana.com/grafana/dashboards/12239>
143169

144170
You will also find the `json` file on this repo under `grafana/dcgm-exporter-dashboard.json`
145171

146172
Pull requests are accepted!
147173

148-
149174
### Building the containers
150175

151176
This project uses [docker buildx](https://docs.docker.com/buildx/working-with-buildx/) for multi-arch image creation. Follow the instructions on that page to get a working builder instance for creating these containers. Some other useful build options follow.
152177

153178
Builds local images based on the machine architecture and makes them available in 'docker images'
179+
154180
```shell
155181
make local
156182
```
157183

158184
Build the ubuntu image and export to 'docker images'
159-
```
185+
186+
```shell
160187
make ubuntu22.04 PLATFORMS=linux/amd64 OUTPUT=type=docker
161188
```
162189

163190
Build and push the images to some other 'private_registry'
164-
```
191+
192+
```shell
165193
make REGISTRY=<private_registry> push
166194
```
167195

dcgm-exporter.yaml

+6-6
Original file line numberDiff line numberDiff line change
@@ -18,23 +18,23 @@ metadata:
1818
name: "dcgm-exporter"
1919
labels:
2020
app.kubernetes.io/name: "dcgm-exporter"
21-
app.kubernetes.io/version: "3.4.1"
21+
app.kubernetes.io/version: "3.4.2"
2222
spec:
2323
updateStrategy:
2424
type: RollingUpdate
2525
selector:
2626
matchLabels:
2727
app.kubernetes.io/name: "dcgm-exporter"
28-
app.kubernetes.io/version: "3.4.1"
28+
app.kubernetes.io/version: "3.4.2"
2929
template:
3030
metadata:
3131
labels:
3232
app.kubernetes.io/name: "dcgm-exporter"
33-
app.kubernetes.io/version: "3.4.1"
33+
app.kubernetes.io/version: "3.4.2"
3434
name: "dcgm-exporter"
3535
spec:
3636
containers:
37-
- image: "nvcr.io/nvidia/k8s/dcgm-exporter:3.3.5-3.4.1-ubuntu22.04"
37+
- image: "nvcr.io/nvidia/k8s/dcgm-exporter:3.3.6-3.4.2-ubuntu22.04"
3838
env:
3939
- name: "DCGM_EXPORTER_LISTEN"
4040
value: ":9400"
@@ -64,11 +64,11 @@ metadata:
6464
name: "dcgm-exporter"
6565
labels:
6666
app.kubernetes.io/name: "dcgm-exporter"
67-
app.kubernetes.io/version: "3.4.1"
67+
app.kubernetes.io/version: "3.4.2"
6868
spec:
6969
selector:
7070
app.kubernetes.io/name: "dcgm-exporter"
71-
app.kubernetes.io/version: "3.4.1"
71+
app.kubernetes.io/version: "3.4.2"
7272
ports:
7373
- name: "metrics"
7474
port: 9400

deployment/Chart.yaml

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
apiVersion: v2
22
name: dcgm-exporter
33
description: A Helm chart for DCGM exporter
4-
version: "3.4.1"
4+
version: "3.4.2"
55
kubeVersion: ">= 1.19.0-0"
6-
appVersion: "3.4.1"
6+
appVersion: "3.4.2"
77
sources:
88
- https://github.com/nvidia/dcgm-exporter
99
home: https://github.com/nvidia/dcgm-exporter/

deployment/values.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ image:
1717
pullPolicy: IfNotPresent
1818
# Image tag defaults to AppVersion, but you can use the tag key
1919
# for the image tag, e.g:
20-
tag: 3.3.5-3.4.1-ubuntu22.04
20+
tag: 3.3.6-3.4.2-ubuntu22.04
2121

2222
# Change the following reference to "/etc/dcgm-exporter/default-counters.csv"
2323
# to stop profiling metrics from DCGM

0 commit comments

Comments
 (0)