Skip to content

Commit 2bc8e4c

Browse files
authored
Merge pull request #60 from castai/feat-additional-gpu-metrics
feat: Start collecting more GPU metrics
2 parents 002c64d + ed5e5df commit 2bc8e4c

30 files changed

+1530
-347
lines changed

.mockery.yaml

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,18 @@
1-
with-expecter: true
2-
dir: 'mock/{{replace .InterfaceDirRelative "internal" "" 1}}'
1+
dir: mock/{{.InterfaceDirRelative | replace "internal" "" 1}}
2+
filename: "mock_{{.InterfaceName | lower}}.go"
3+
template: testify
4+
template-data:
5+
unroll-variadic: true
36
packages:
47
"github.com/castai/gpu-metrics-exporter/internal/exporter":
58
interfaces:
6-
Exporter:
7-
Scraper:
8-
MetricMapper:
9-
HttpClient:
9+
Exporter: {}
10+
Scraper: {}
11+
MetricMapper: {}
12+
HTTPClient: {}
13+
"github.com/castai/gpu-metrics-exporter/internal/workload":
14+
interfaces:
15+
Resolver: {}
1016
"github.com/castai/gpu-metrics-exporter/internal/castai":
1117
interfaces:
12-
Client:
18+
Client: {}

README.md

Lines changed: 25 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -20,21 +20,38 @@ nv-hostengine.
2020
Make sure that these fields are exposed by the DCGM exporter as metrics:
2121

2222
```
23+
DCGM_FI_DEV_GPU_TEMP
24+
DCGM_FI_DEV_MEMORY_TEMP
25+
DCGM_FI_DEV_POWER_USAGE
26+
DCGM_FI_DEV_MEM_MAX_OP_TEMP
27+
DCGM_FI_DEV_GPU_MAX_OP_TEMP
28+
DCGM_FI_DEV_SM_CLOCK
29+
DCGM_FI_DEV_GPU_UTIL
2330
DCGM_FI_PROF_SM_ACTIVE
2431
DCGM_FI_PROF_SM_OCCUPANCY
25-
DCGM_FI_PROF_PIPE_TENSOR_ACTIVE
26-
DCGM_FI_PROF_DRAM_ACTIVE
27-
DCGM_FI_PROF_PCIE_TX_BYTES
28-
DCGM_FI_PROF_PCIE_RX_BYTES
2932
DCGM_FI_PROF_GR_ENGINE_ACTIVE
30-
DCGM_FI_DEV_FB_TOTAL
33+
DCGM_FI_PROF_DRAM_ACTIVE
3134
DCGM_FI_DEV_FB_FREE
3235
DCGM_FI_DEV_FB_USED
36+
DCGM_FI_DEV_FB_TOTAL
37+
DCGM_FI_DEV_MEM_COPY_UTIL
38+
DCGM_FI_PROF_PCIE_TX_BYTES
39+
DCGM_FI_PROF_PCIE_RX_BYTES
3340
DCGM_FI_DEV_PCIE_LINK_GEN
3441
DCGM_FI_DEV_PCIE_LINK_WIDTH
35-
DCGM_FI_DEV_GPU_TEMP
36-
DCGM_FI_DEV_MEMORY_TEMP
37-
DCGM_FI_DEV_POWER_USAGE
42+
DCGM_FI_PROF_NVLINK_TX_BYTES
43+
DCGM_FI_PROF_NVLINK_RX_BYTES
44+
DCGM_FI_PROF_PIPE_INT_ACTIVE
45+
DCGM_FI_PROF_PIPE_FP16_ACTIVE
46+
DCGM_FI_PROF_PIPE_FP32_ACTIVE
47+
DCGM_FI_PROF_PIPE_FP64_ACTIVE
48+
DCGM_FI_PROF_PIPE_TENSOR_ACTIVE
49+
DCGM_FI_DEV_MIG_MODE
50+
DCGM_FI_DEV_MIG_MAX_SLICES
51+
DCGM_FI_DEV_CLOCKS_EVENT_REASONS
52+
DCGM_FI_DEV_XID_ERRORS
53+
DCGM_FI_DEV_POWER_VIOLATION
54+
DCGM_FI_DEV_THERMAL_VIOLATION
3855
```
3956

4057
## Installation

charts/gpu-metrics-exporter/templates/dcgm-exporter-configmap.yaml

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,14 +16,14 @@ data:
1616
DCGM_FI_DEV_GPU_UTIL, gauge, GPU utilization (in %).
1717
DCGM_FI_PROF_SM_ACTIVE, gauge, The ratio of cycles an SM has at least 1 warp assigned
1818
DCGM_FI_PROF_SM_OCCUPANCY, gauge, The fraction of resident warps on a multiprocessor
19-
DCGM_FI_PROF_PIPE_TENSOR_ACTIVE, gauge, The ratio of cycles the tensor (HMMA) pipe is active (off the peak sustained elapsed cycles)
2019
DCGM_FI_PROF_GR_ENGINE_ACTIVE, gauge, Ratio of time the graphics engine is active (in %).
2120
DCGM_FI_PROF_DRAM_ACTIVE, gauge, The ratio of cycles the device memory interface is active sending or receiving data.
2221
2322
# Memory usage,,
2423
DCGM_FI_DEV_FB_FREE, gauge, Framebuffer memory free (in MiB).
2524
DCGM_FI_DEV_FB_USED, gauge, Framebuffer memory used (in MiB).
2625
DCGM_FI_DEV_FB_TOTAL, gauge, Total Frame Buffer of the GPU in MB.
26+
DCGM_FI_DEV_MEM_COPY_UTIL, gauge, Utilization of the memory copy engine.
2727
2828
# PCIE,,
2929
DCGM_FI_PROF_PCIE_TX_BYTES, gauge, Total number of bytes transmitted through PCIe TX
@@ -36,4 +36,15 @@ data:
3636
DCGM_FI_PROF_PIPE_FP16_ACTIVE, gauge, Ratio of cycles the fp16 pipe is active.
3737
DCGM_FI_PROF_PIPE_FP32_ACTIVE, gauge, Ratio of cycles the fp32 pipe is active.
3838
DCGM_FI_PROF_PIPE_FP64_ACTIVE, gauge, Ratio of cycles the fp64 pipe is active.
39+
DCGM_FI_PROF_PIPE_TENSOR_ACTIVE, gauge, The ratio of cycles the tensor (HMMA) pipe is active (off the peak sustained elapsed cycles)
40+
41+
# Health,,
42+
DCGM_FI_DEV_CLOCKS_EVENT_REASONS, gauge, Current clock event reasons (bitmask of DCGM_CLOCKS_EVENT_REASON_*)
43+
DCGM_FI_DEV_XID_ERRORS, gauge, The value is the specific XID error
44+
DCGM_FI_DEV_POWER_VIOLATION, gauge, Power Violation time in ns.
45+
DCGM_FI_DEV_THERMAL_VIOLATION, gauge, Thermal Violation time in ns.
46+
47+
# NVLink,,
48+
DCGM_FI_PROF_NVLINK_TX_BYTES, gauge, The number of bytes of active NvLink tx (transmit) data including both header and payload.
49+
DCGM_FI_PROF_NVLINK_RX_BYTES, gauge, The number of bytes of active NvLink rx (read) data including both header and payload.
3950
{{- end }}

charts/gpu-metrics-exporter/templates/move-api-key-to-secret.yaml

Lines changed: 0 additions & 44 deletions
This file was deleted.

cmd/main.go

Lines changed: 63 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import (
44
"context"
55
"errors"
66
"fmt"
7+
"log/slog"
78
"net/http"
89
"os"
910
"os/signal"
@@ -14,14 +15,17 @@ import (
1415
"github.com/sirupsen/logrus"
1516
"k8s.io/apimachinery/pkg/labels"
1617
"k8s.io/apimachinery/pkg/selection"
17-
"k8s.io/client-go/kubernetes"
18+
"k8s.io/client-go/dynamic"
1819
"k8s.io/client-go/tools/clientcmd"
1920
"k8s.io/client-go/util/flowcontrol"
2021

2122
"github.com/castai/gpu-metrics-exporter/internal/castai"
2223
"github.com/castai/gpu-metrics-exporter/internal/config"
2324
"github.com/castai/gpu-metrics-exporter/internal/exporter"
2425
"github.com/castai/gpu-metrics-exporter/internal/server"
26+
"github.com/castai/gpu-metrics-exporter/internal/workload"
27+
"github.com/castai/logging"
28+
"github.com/castai/metrics"
2529
)
2630

2731
var (
@@ -30,6 +34,11 @@ var (
3034
Version = "local"
3135
)
3236

37+
const (
38+
workloadCacheSize = 512
39+
workloadsLabelKey = "workloads.cast.ai/custom-workload"
40+
)
41+
3342
func main() {
3443
log := logrus.New()
3544

@@ -38,19 +47,30 @@ func main() {
3847
log.Fatal(err)
3948
}
4049

41-
logLevel, err := logrus.ParseLevel(cfg.LogLevel)
50+
logLevel, err := parseLogLevel(cfg.LogLevel)
4251
if err != nil {
43-
log.Fatal(err)
52+
log.Warnf("failed to parse log level, defaulting to 'info': %v", err)
53+
logLevel = slog.LevelInfo
4454
}
45-
log.SetLevel(logLevel)
4655

47-
if err := run(cfg, log); err != nil && !errors.Is(err, context.Canceled) {
56+
castaiLogger := logging.New(logging.NewTextHandler(logging.TextHandlerConfig{
57+
Output: os.Stdout,
58+
Level: logLevel,
59+
}))
60+
61+
if err := run(cfg, castaiLogger); err != nil && !errors.Is(err, context.Canceled) {
4862
log.Fatal(err)
4963
}
5064
}
5165

52-
func run(cfg *config.Config, log logrus.FieldLogger) error {
53-
mux := server.NewServerMux(log)
66+
func parseLogLevel(level string) (slog.Level, error) {
67+
var lvl slog.Level
68+
err := lvl.UnmarshalText([]byte(level))
69+
return lvl, err
70+
}
71+
72+
func run(cfg *config.Config, log *logging.Logger) error {
73+
mux := server.NewServerMux()
5474

5575
srv := &http.Server{
5676
Addr: fmt.Sprintf(":%d", cfg.HTTPListenPort),
@@ -75,19 +95,45 @@ func run(cfg *config.Config, log logrus.FieldLogger) error {
7595
cancel()
7696
}()
7797

78-
clientset, err := newKubernetesClientset(cfg)
98+
dynClient, err := newDynamicClient(cfg)
7999
if err != nil {
80-
log.Fatal(err)
100+
log.WithField("error", err.Error()).Fatal("failed to create kubernetes dynamic client")
81101
}
82102

83103
labelSelector, err := selectorFromMap(cfg.DCGMLabels)
84104
if err != nil {
85-
log.Fatal(err)
105+
log.WithField("error", err.Error()).Fatal("failed to create label selector")
106+
}
107+
108+
metricClient, err := metrics.NewMetricClient(
109+
metrics.Config{
110+
APIAddr: cfg.TelemetryURL,
111+
APIToken: cfg.APIKey,
112+
ClusterID: cfg.ClusterID,
113+
}, log)
114+
if err != nil {
115+
log.WithField("error", err.Error()).Warn("failed to create metrics client")
116+
}
117+
118+
if metricClient != nil {
119+
go func() {
120+
if err := metricClient.Start(ctx); err != nil && !errors.Is(err, context.Canceled) {
121+
log.WithField("error", err.Error()).Error("error in metrics client")
122+
}
123+
}()
86124
}
87125

88126
client := setupCastAIClient(log, cfg)
89127
scraper := exporter.NewScraper(&http.Client{}, log)
90-
mapper := exporter.NewMapper(cfg.NodeName)
128+
workloadResolver, err := workload.NewResolver(dynClient, workload.Config{
129+
LabelKeys: []string{workloadsLabelKey},
130+
CacheSize: workloadCacheSize,
131+
})
132+
if err != nil {
133+
log.WithField("error", err.Error()).Fatal("failed to create workload resolver")
134+
}
135+
136+
mapper := exporter.NewMapper(cfg.NodeName, workloadResolver, log)
91137
ex := exporter.NewExporter(exporter.Config{
92138
ExportInterval: cfg.ExportInterval,
93139
Selector: labelSelector.String(),
@@ -96,7 +142,7 @@ func run(cfg *config.Config, log logrus.FieldLogger) error {
96142
DCGMExporterHost: cfg.DCGMHost,
97143
Enabled: true,
98144
NodeName: cfg.NodeName,
99-
}, clientset, log, scraper, mapper, client)
145+
}, dynClient, log, scraper, mapper, client, metricClient)
100146

101147
go func() {
102148
if err := ex.Start(ctx); err != nil && !errors.Is(err, context.Canceled) {
@@ -108,19 +154,14 @@ func run(cfg *config.Config, log logrus.FieldLogger) error {
108154
return srv.ListenAndServe()
109155
}
110156

111-
func newKubernetesClientset(cfg *config.Config) (*kubernetes.Clientset, error) {
112-
config, err := clientcmd.BuildConfigFromFlags("", cfg.KubeConfigPath)
113-
if err != nil {
114-
return nil, err
115-
}
116-
config.RateLimiter = flowcontrol.NewTokenBucketRateLimiter(float32(10), 25)
117-
118-
clientset, err := kubernetes.NewForConfig(config)
157+
func newDynamicClient(cfg *config.Config) (dynamic.Interface, error) {
158+
restConfig, err := clientcmd.BuildConfigFromFlags("", cfg.KubeConfigPath)
119159
if err != nil {
120160
return nil, err
121161
}
162+
restConfig.RateLimiter = flowcontrol.NewTokenBucketRateLimiter(float32(10), 25)
122163

123-
return clientset, nil
164+
return dynamic.NewForConfig(restConfig)
124165
}
125166

126167
func selectorFromMap(labelMap map[string]string) (labels.Selector, error) {
@@ -138,7 +179,7 @@ func selectorFromMap(labelMap map[string]string) (labels.Selector, error) {
138179
return selector.Add(requirements...), nil
139180
}
140181

141-
func setupCastAIClient(log logrus.FieldLogger, cfg *config.Config) castai.Client {
182+
func setupCastAIClient(log *logging.Logger, cfg *config.Config) castai.Client {
142183
clientConfig := castai.Config{
143184
ClusterID: cfg.ClusterID,
144185
APIKey: cfg.APIKey,

gen_mockery.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
1-
//go:generate go run github.com/vektra/mockery/v2@v2.42.0 --all
1+
//go:generate go run github.com/vektra/mockery/v3@v3.5.3
22
package mockery

0 commit comments

Comments
 (0)