Skip to content

Commit afebe5b

Browse files
committed
Add DRA request Prometheus metrics
Signed-off-by: Sheng Lin <shelin@nvidia.com>
1 parent 2510b35 commit afebe5b

File tree

17 files changed

+548
-24
lines changed

17 files changed

+548
-24
lines changed

cmd/compute-domain-controller/computedomain.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ import (
2828
"k8s.io/klog/v2"
2929

3030
nvapi "github.com/NVIDIA/k8s-dra-driver-gpu/api/nvidia.com/resource/v1beta1"
31+
"github.com/NVIDIA/k8s-dra-driver-gpu/pkg/metrics"
3132
nvinformers "github.com/NVIDIA/k8s-dra-driver-gpu/pkg/nvidia.com/informers/externalversions"
3233
nvlisters "github.com/NVIDIA/k8s-dra-driver-gpu/pkg/nvidia.com/listers/resource/v1beta1"
3334
)
@@ -207,6 +208,8 @@ func (m *ComputeDomainManager) UpdateStatus(ctx context.Context, cd *nvapi.Compu
207208
// Recalculate global status based on current state
208209
cd.Status.Status = m.calculateGlobalStatus(cd)
209210

211+
metrics.ObserveComputeDomainStatus(string(cd.UID), cd.Status.Status)
212+
210213
updatedCD, err := m.config.clientsets.Nvidia.ResourceV1beta1().ComputeDomains(cd.Namespace).UpdateStatus(ctx, cd, metav1.UpdateOptions{})
211214
if err != nil {
212215
return nil, err
@@ -344,6 +347,7 @@ func (m *ComputeDomainManager) onAddOrUpdate(ctx context.Context, obj any) error
344347
return fmt.Errorf("error removing finalizer: %w", err)
345348
}
346349

350+
metrics.ForgetComputeDomain(string(cd.UID))
347351
return nil
348352
}
349353

cmd/compute-domain-controller/controller.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,13 @@ type ManagerConfig struct {
5656
// logVerbosityCDDaemon controls the log verbosity for dynamically launched
5757
// ComputeDomain daemons.
5858
logVerbosityCDDaemon int
59+
60+
// httpEndpoint is the TCP network address where the HTTP server for diagnostics
61+
// (including pprof and metrics) will listen
62+
httpEndpoint string
63+
64+
// metricsPath is the HTTP path for Prometheus metrics
65+
metricsPath string
5966
}
6067

6168
// Controller manages the lifecycle of the DRA driver and its components.
@@ -84,6 +91,8 @@ func (c *Controller) Run(ctx context.Context) error {
8491
clientsets: c.config.clientsets,
8592
workQueue: workQueue,
8693
logVerbosityCDDaemon: c.config.flags.logVerbosityCDDaemon,
94+
httpEndpoint: c.config.flags.httpEndpoint,
95+
metricsPath: c.config.flags.metricsPath,
8796
}
8897

8998
// TODO: log full, nested cliFlags structure.

cmd/compute-domain-controller/main.go

Lines changed: 2 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -27,12 +27,9 @@ import (
2727
"path"
2828
"syscall"
2929

30-
"github.com/prometheus/client_golang/prometheus"
31-
"github.com/prometheus/client_golang/prometheus/promhttp"
3230
"github.com/urfave/cli/v2"
3331

3432
"k8s.io/component-base/logs"
35-
"k8s.io/component-base/metrics/legacyregistry"
3633
"k8s.io/klog/v2"
3734

3835
_ "k8s.io/component-base/metrics/prometheus/restclient" // for client metric registration
@@ -43,6 +40,7 @@ import (
4340
"github.com/NVIDIA/k8s-dra-driver-gpu/internal/info"
4441
"github.com/NVIDIA/k8s-dra-driver-gpu/pkg/featuregates"
4542
pkgflags "github.com/NVIDIA/k8s-dra-driver-gpu/pkg/flags"
43+
"github.com/NVIDIA/k8s-dra-driver-gpu/pkg/metrics"
4644
)
4745

4846
const (
@@ -255,24 +253,9 @@ func newApp() *cli.App {
255253

256254
func SetupHTTPEndpoint(config *Config) error {
257255
if config.flags.metricsPath != "" {
258-
// To collect metrics data from the metric handler itself, we
259-
// let it register itself and then collect from that registry.
260-
reg := prometheus.NewRegistry()
261-
gatherers := prometheus.Gatherers{
262-
// Include Go runtime and process metrics:
263-
// https://github.com/kubernetes/kubernetes/blob/9780d88cb6a4b5b067256ecb4abf56892093ee87/staging/src/k8s.io/component-base/metrics/legacyregistry/registry.go#L46-L49
264-
legacyregistry.DefaultGatherer,
265-
}
266-
gatherers = append(gatherers, reg)
267-
268256
actualPath := path.Join("/", config.flags.metricsPath)
269257
klog.InfoS("Starting metrics", "path", actualPath)
270-
// This is similar to k8s.io/component-base/metrics HandlerWithReset
271-
// except that we gather from multiple sources.
272-
config.mux.Handle(actualPath,
273-
promhttp.InstrumentMetricHandler(
274-
reg,
275-
promhttp.HandlerFor(gatherers, promhttp.HandlerOpts{})))
258+
config.mux.Handle(path.Join("/", config.flags.metricsPath), metrics.NewLegacyPrometheusHandler())
276259
}
277260

278261
if config.flags.profilePath != "" {

cmd/compute-domain-daemon/controller.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,8 @@ type DaemonInfoManager interface {
3636

3737
// ManagerConfig holds the configuration for the compute domain manager.
3838
type ManagerConfig struct {
39+
httpEndpoint string
40+
metricsPath string
3941
workQueue *workqueue.WorkQueue
4042
clientsets flags.ClientSets
4143
nodeName string
@@ -52,6 +54,8 @@ type ManagerConfig struct {
5254

5355
// ControllerConfig holds the configuration for the controller.
5456
type ControllerConfig struct {
57+
httpEndpoint string
58+
metricsPath string
5559
clientsets flags.ClientSets
5660
nodeName string
5761
computeDomainUUID string
@@ -77,6 +81,8 @@ func NewController(config *ControllerConfig) (*Controller, error) {
7781

7882
mc := &ManagerConfig{
7983
workQueue: workQueue,
84+
httpEndpoint: config.httpEndpoint,
85+
metricsPath: config.metricsPath,
8086
clientsets: config.clientsets,
8187
nodeName: config.nodeName,
8288
computeDomainUUID: config.computeDomainUUID,

cmd/compute-domain-daemon/main.go

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,8 @@ type Flags struct {
6161
podName string
6262
podNamespace string
6363
maxNodesPerIMEXDomain int
64+
httpEndpoint string
65+
metricsPath string
6466
klogVerbosity int
6567
}
6668

@@ -236,6 +238,8 @@ func run(ctx context.Context, cancel context.CancelFunc, flags *Flags) error {
236238
}
237239

238240
config := &ControllerConfig{
241+
httpEndpoint: flags.httpEndpoint,
242+
metricsPath: flags.metricsPath,
239243
clientsets: clientsets,
240244
cliqueID: flags.cliqueID,
241245
computeDomainUUID: flags.computeDomainUUID,
@@ -330,6 +334,20 @@ func run(ctx context.Context, cancel context.CancelFunc, flags *Flags) error {
330334
return nil
331335
}
332336

337+
// countPeerNodes returns how many daemons share this clique excluding the local node.
338+
func countPeerNodes(cliqueID, localNodeName string, daemons []*nvapi.ComputeDomainDaemonInfo) int {
339+
if cliqueID == "" {
340+
return 0
341+
}
342+
n := 0
343+
for _, d := range daemons {
344+
if d != nil && d.CliqueID == cliqueID && d.NodeName != localNodeName {
345+
n++
346+
}
347+
}
348+
return n
349+
}
350+
333351
// IMEXDaemonUpdateLoopWithIPs reacts to ComputeDomain status changes by updating the
334352
// IMEX daemon nodes config file and (re)starting the IMEX daemon process.
335353
func IMEXDaemonUpdateLoopWithIPs(ctx context.Context, controller *Controller, cliqueID string, pm *ProcessManager) error {

cmd/compute-domain-kubelet-plugin/device_state.go

Lines changed: 43 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ import (
3434
configapi "github.com/NVIDIA/k8s-dra-driver-gpu/api/nvidia.com/resource/v1beta1"
3535
"github.com/NVIDIA/k8s-dra-driver-gpu/internal/common"
3636
"github.com/NVIDIA/k8s-dra-driver-gpu/pkg/featuregates"
37+
drametrics "github.com/NVIDIA/k8s-dra-driver-gpu/pkg/metrics"
3738
)
3839

3940
type OpaqueDeviceConfig struct {
@@ -132,6 +133,11 @@ func NewDeviceState(ctx context.Context, config *Config) (*DeviceState, error) {
132133
for _, c := range checkpoints {
133134
if c == DriverPluginCheckpointFileBasename {
134135
klog.Infof("Found previous checkpoint: %s", c)
136+
cp, err := state.getCheckpoint()
137+
if err != nil {
138+
return nil, fmt.Errorf("unable to get checkpoint: %w", err)
139+
}
140+
syncPreparedDevicesGaugeFromCheckpoint(config.flags.nodeName, cp)
135141
return state, nil
136142
}
137143
}
@@ -288,7 +294,11 @@ func (s *DeviceState) Unprepare(ctx context.Context, claimRef kubeletplugin.Name
288294
}
289295

290296
func (s *DeviceState) createCheckpoint(cp *Checkpoint) error {
291-
return s.checkpointManager.CreateCheckpoint(DriverPluginCheckpointFileBasename, cp)
297+
if err := s.checkpointManager.CreateCheckpoint(DriverPluginCheckpointFileBasename, cp); err != nil {
298+
return err
299+
}
300+
syncPreparedDevicesGaugeFromCheckpoint(s.config.flags.nodeName, cp)
301+
return nil
292302
}
293303

294304
func (s *DeviceState) getCheckpoint() (*Checkpoint, error) {
@@ -317,6 +327,7 @@ func (s *DeviceState) updateCheckpoint(mutate func(*Checkpoint)) error {
317327
return fmt.Errorf("unable to create checkpoint: %w", err)
318328
}
319329

330+
syncPreparedDevicesGaugeFromCheckpoint(s.config.flags.nodeName, checkpoint)
320331
return nil
321332
}
322333

@@ -825,3 +836,34 @@ func (s *DeviceState) validateNoOverlappingPreparedDevices(checkpoint *Checkpoin
825836
}
826837
return nil
827838
}
839+
840+
func syncPreparedDevicesGaugeFromCheckpoint(nodeName string, cp *Checkpoint) {
841+
counts := make(map[string]int)
842+
if cp == nil {
843+
return
844+
}
845+
lv := cp.ToLatestVersion()
846+
if lv != nil && lv.V2 != nil {
847+
for _, pc := range lv.V2.PreparedClaims {
848+
if pc.CheckpointState != ClaimCheckpointStatePrepareCompleted {
849+
continue
850+
}
851+
for _, g := range pc.PreparedDevices {
852+
for _, dev := range g.Devices {
853+
if _, ok := counts[dev.Type()]; !ok {
854+
counts[dev.Type()] = 0
855+
}
856+
counts[dev.Type()]++
857+
}
858+
}
859+
}
860+
}
861+
862+
for _, dt := range []string{ComputeDomainChannelType, ComputeDomainDaemonType, UnknownDeviceType} {
863+
if count, ok := counts[dt]; !ok {
864+
drametrics.SetPreparedDevicesCounts(nodeName, DriverName, dt, 0)
865+
} else {
866+
drametrics.SetPreparedDevicesCounts(nodeName, DriverName, dt, count)
867+
}
868+
}
869+
}

cmd/compute-domain-kubelet-plugin/driver.go

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ import (
3333
"k8s.io/klog/v2"
3434

3535
"github.com/NVIDIA/k8s-dra-driver-gpu/pkg/flock"
36+
drametrics "github.com/NVIDIA/k8s-dra-driver-gpu/pkg/metrics"
3637
"github.com/NVIDIA/k8s-dra-driver-gpu/pkg/workqueue"
3738
)
3839

@@ -241,14 +242,19 @@ func (d *driver) HandleError(ctx context.Context, err error, msg string) {
241242
// also reflect an error. Set the boolean to `true` for any result wrapping a
242243
// non-retryable error.
243244
func (d *driver) nodePrepareResource(ctx context.Context, claim *resourceapi.ResourceClaim) (bool, kubeletplugin.PrepareResult) {
245+
t0 := time.Now()
246+
244247
release, err := d.pulock.Acquire(ctx, flock.WithTimeout(10*time.Second))
245248
if err != nil {
249+
drametrics.IncNodePrepareError(DriverName, "lock_acquire")
246250
res := kubeletplugin.PrepareResult{
247251
Err: fmt.Errorf("error acquiring prep/unprep lock: %w", err),
248252
}
249253
return false, res
250254
}
251255
defer release()
256+
doneInFlight := drametrics.TrackInFlight(DriverName, "prepare")
257+
defer doneInFlight()
252258

253259
if claim.Status.Allocation == nil {
254260
res := kubeletplugin.PrepareResult{
@@ -259,6 +265,7 @@ func (d *driver) nodePrepareResource(ctx context.Context, claim *resourceapi.Res
259265

260266
devs, err := d.state.Prepare(ctx, claim)
261267
if err != nil {
268+
drametrics.IncNodePrepareError(DriverName, "prepare_devices")
262269
res := kubeletplugin.PrepareResult{
263270
Err: fmt.Errorf("error preparing devices for claim '%s': %w", ResourceClaimToString(claim), err),
264271
}
@@ -270,24 +277,32 @@ func (d *driver) nodePrepareResource(ctx context.Context, claim *resourceapi.Res
270277
}
271278

272279
klog.V(1).Infof("Prepared devices for claim '%s': %v", ResourceClaimToString(claim), devs)
280+
drametrics.ObserveRequest(DriverName, "prepare", time.Since(t0))
273281

274282
return true, kubeletplugin.PrepareResult{Devices: devs}
275283
}
276284

277285
// Return 2-tuple: the first value is a boolean indicating to the retry logic
278286
// whether the work is 'done'.
279287
func (d *driver) nodeUnprepareResource(ctx context.Context, claimRef kubeletplugin.NamespacedObject) (bool, error) {
288+
tstart := time.Now()
289+
280290
release, err := d.pulock.Acquire(ctx, flock.WithTimeout(10*time.Second))
281291
if err != nil {
292+
drametrics.IncNodeUnprepareError(DriverName, "lock_acquire")
282293
return false, fmt.Errorf("error acquiring prep/unprep lock: %w", err)
283294
}
284295
defer release()
296+
doneInFlight := drametrics.TrackInFlight(DriverName, "unprepare")
297+
defer doneInFlight()
285298

286299
if err := d.state.Unprepare(ctx, claimRef); err != nil {
300+
drametrics.IncNodeUnprepareError(DriverName, "unprepare_devices")
287301
return isPermanentError(err), fmt.Errorf("error unpreparing devices for claim '%v': %w", claimRef.String(), err)
288302
}
289303

290304
klog.V(1).Infof("Unprepared devices for claim '%v'", claimRef.String())
305+
drametrics.ObserveRequest(DriverName, "unprepare", time.Since(tstart))
291306
return true, nil
292307
}
293308

cmd/compute-domain-kubelet-plugin/main.go

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,15 +27,15 @@ import (
2727

2828
"github.com/urfave/cli/v2"
2929

30+
"k8s.io/component-base/logs"
3031
"k8s.io/dynamic-resource-allocation/kubeletplugin"
3132
"k8s.io/klog/v2"
3233

33-
"k8s.io/component-base/logs"
34-
3534
"github.com/NVIDIA/k8s-dra-driver-gpu/internal/common"
3635
"github.com/NVIDIA/k8s-dra-driver-gpu/internal/info"
3736
"github.com/NVIDIA/k8s-dra-driver-gpu/pkg/featuregates"
3837
pkgflags "github.com/NVIDIA/k8s-dra-driver-gpu/pkg/flags"
38+
"github.com/NVIDIA/k8s-dra-driver-gpu/pkg/metrics"
3939
)
4040

4141
const (
@@ -48,6 +48,8 @@ type Flags struct {
4848
kubeClientConfig pkgflags.KubeClientConfig
4949

5050
nodeName string
51+
httpEndpoint string
52+
metricsPath string
5153
namespace string
5254
cdiRoot string
5355
containerDriverRoot string
@@ -95,6 +97,19 @@ func newApp() *cli.App {
9597
Destination: &flags.namespace,
9698
EnvVars: []string{"NAMESPACE"},
9799
},
100+
&cli.StringFlag{
101+
Name: "http-endpoint",
102+
Usage: "The TCP network `address` where the metrics HTTP server will listen (example: `:8080`). The default is the empty string, which means the server is disabled.",
103+
Destination: &flags.httpEndpoint,
104+
EnvVars: []string{"HTTP_ENDPOINT"},
105+
},
106+
&cli.StringFlag{
107+
Name: "metrics-path",
108+
Usage: "The HTTP `path` where Prometheus metrics are exposed, disabled if empty.",
109+
Value: "/metrics",
110+
Destination: &flags.metricsPath,
111+
EnvVars: []string{"METRICS_PATH"},
112+
},
98113
&cli.StringFlag{
99114
Name: "cdi-root",
100115
Usage: "Absolute path to the directory where CDI files will be generated.",
@@ -239,6 +254,12 @@ func RunPlugin(ctx context.Context, config *Config) error {
239254
ctx, cancel := signal.NotifyContext(ctx, syscall.SIGHUP, syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT)
240255
defer cancel()
241256

257+
if config.flags.httpEndpoint != "" {
258+
if err := metrics.RunPrometheusMetricsServer(ctx, config.flags.httpEndpoint, config.flags.metricsPath); err != nil {
259+
return fmt.Errorf("setup metrics endpoint: %w", err)
260+
}
261+
}
262+
242263
// Create and start the driver
243264
driver, err := NewDriver(ctx, config)
244265
if err != nil {

0 commit comments

Comments
 (0)