Skip to content

Commit 0d74968

Browse files
committed
Abstract Prometheus server start
Signed-off-by: Sheng Lin <shelin@nvidia.com>
1 parent 5a94361 commit 0d74968

File tree

10 files changed

+118
-167
lines changed

10 files changed

+118
-167
lines changed

cmd/compute-domain-controller/cdclique.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -146,16 +146,16 @@ func (m *ComputeDomainCliqueManager) Update(ctx context.Context, clique *nvapi.C
146146

147147
func attachComputeDomainCliqueMetricsHandlers(informer cache.SharedIndexInformer) error {
148148
_, err := informer.AddEventHandler(cache.ResourceEventHandlerFuncs{
149-
AddFunc: observeCompueDomainCliqueMetrics,
149+
AddFunc: observeComputeDomainCliqueMetrics,
150150
UpdateFunc: func(_, newObj any) {
151-
observeCompueDomainCliqueMetrics(newObj)
151+
observeComputeDomainCliqueMetrics(newObj)
152152
},
153153
DeleteFunc: forgetComputeDomainCliqueMetrics,
154154
})
155155
return err
156156
}
157157

158-
func observeCompueDomainCliqueMetrics(obj any) {
158+
func observeComputeDomainCliqueMetrics(obj any) {
159159
clique, ok := obj.(*nvapi.ComputeDomainClique)
160160
if !ok {
161161
return

cmd/compute-domain-controller/main.go

Lines changed: 2 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -27,12 +27,9 @@ import (
2727
"path"
2828
"syscall"
2929

30-
"github.com/prometheus/client_golang/prometheus"
31-
"github.com/prometheus/client_golang/prometheus/promhttp"
3230
"github.com/urfave/cli/v2"
3331

3432
"k8s.io/component-base/logs"
35-
"k8s.io/component-base/metrics/legacyregistry"
3633
"k8s.io/klog/v2"
3734

3835
_ "k8s.io/component-base/metrics/prometheus/restclient" // for client metric registration
@@ -43,6 +40,7 @@ import (
4340
"github.com/NVIDIA/k8s-dra-driver-gpu/internal/info"
4441
"github.com/NVIDIA/k8s-dra-driver-gpu/pkg/featuregates"
4542
pkgflags "github.com/NVIDIA/k8s-dra-driver-gpu/pkg/flags"
43+
"github.com/NVIDIA/k8s-dra-driver-gpu/pkg/metrics"
4644
)
4745

4846
const (
@@ -255,24 +253,9 @@ func newApp() *cli.App {
255253

256254
func SetupHTTPEndpoint(config *Config) error {
257255
if config.flags.metricsPath != "" {
258-
// To collect metrics data from the metric handler itself, we
259-
// let it register itself and then collect from that registry.
260-
reg := prometheus.NewRegistry()
261-
gatherers := prometheus.Gatherers{
262-
// Include Go runtime and process metrics:
263-
// https://github.com/kubernetes/kubernetes/blob/9780d88cb6a4b5b067256ecb4abf56892093ee87/staging/src/k8s.io/component-base/metrics/legacyregistry/registry.go#L46-L49
264-
legacyregistry.DefaultGatherer,
265-
}
266-
gatherers = append(gatherers, reg)
267-
268256
actualPath := path.Join("/", config.flags.metricsPath)
269257
klog.InfoS("Starting metrics", "path", actualPath)
270-
// This is similar to k8s.io/component-base/metrics HandlerWithReset
271-
// except that we gather from multiple sources.
272-
config.mux.Handle(actualPath,
273-
promhttp.InstrumentMetricHandler(
274-
reg,
275-
promhttp.HandlerFor(gatherers, promhttp.HandlerOpts{})))
258+
config.mux.Handle(path.Join("/", config.flags.metricsPath), metrics.NewLegacyPrometheusHandler())
276259
}
277260

278261
if config.flags.profilePath != "" {

cmd/compute-domain-daemon/main.go

Lines changed: 9 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -20,32 +20,26 @@ import (
2020
"bytes"
2121
"context"
2222
"encoding/json"
23-
"errors"
2423
"fmt"
25-
"net"
26-
"net/http"
2724
"os"
2825
"os/exec"
2926
"os/signal"
30-
"path"
3127
"path/filepath"
3228
"sync"
3329
"syscall"
3430
"text/template"
35-
"time"
3631

3732
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
3833
"k8s.io/apimachinery/pkg/types"
3934
"k8s.io/klog/v2"
4035

41-
"github.com/prometheus/client_golang/prometheus/promhttp"
4236
"github.com/urfave/cli/v2"
4337

4438
nvapi "github.com/NVIDIA/k8s-dra-driver-gpu/api/nvidia.com/resource/v1beta1"
4539
"github.com/NVIDIA/k8s-dra-driver-gpu/internal/common"
4640
"github.com/NVIDIA/k8s-dra-driver-gpu/pkg/featuregates"
4741
pkgflags "github.com/NVIDIA/k8s-dra-driver-gpu/pkg/flags"
48-
daemonmetrics "github.com/NVIDIA/k8s-dra-driver-gpu/pkg/metrics"
42+
metrics "github.com/NVIDIA/k8s-dra-driver-gpu/pkg/metrics"
4943
)
5044

5145
const (
@@ -234,10 +228,10 @@ func run(ctx context.Context, cancel context.CancelFunc, flags *Flags) error {
234228
}
235229

236230
if flags.httpEndpoint != "" {
237-
if err := setupMetricsEndpoint(ctx, flags.httpEndpoint, flags.metricsPath); err != nil {
231+
if err := metrics.RunPrometheusMetricsServer(ctx, flags.httpEndpoint, flags.metricsPath); err != nil {
238232
return fmt.Errorf("setup metrics endpoint: %w", err)
239233
}
240-
daemonmetrics.SetComputeDomainDaemonEnsemblePeerNodes(0)
234+
metrics.SetComputeDomainDaemonPeerNodes(0)
241235
}
242236

243237
// Create clientsets for Kubernetes API access
@@ -361,53 +355,16 @@ func run(ctx context.Context, cancel context.CancelFunc, flags *Flags) error {
361355
return nil
362356
}
363357

364-
func setupMetricsEndpoint(ctx context.Context, endpoint, metricsPath string) error {
365-
if metricsPath == "" {
366-
return nil
367-
}
368-
369-
mux := http.NewServeMux()
370-
actualPath := path.Join("/", metricsPath)
371-
mux.Handle(actualPath, promhttp.Handler())
372-
listener, err := net.Listen("tcp", endpoint)
373-
if err != nil {
374-
return fmt.Errorf("listen on metrics endpoint: %w", err)
375-
}
376-
377-
server := &http.Server{Handler: mux}
378-
go func() {
379-
<-ctx.Done()
380-
shutdownCtx, cancel := context.WithTimeout(context.WithoutCancel(ctx), 30*time.Second)
381-
defer cancel()
382-
_ = server.Shutdown(shutdownCtx)
383-
}()
384-
385-
go func() {
386-
klog.InfoS("Starting metrics HTTP server", "endpoint", endpoint, "path", actualPath)
387-
err := server.Serve(listener)
388-
if err != nil && !errors.Is(err, http.ErrServerClosed) {
389-
klog.ErrorS(err, "metrics HTTP server failed")
390-
klog.FlushAndExit(klog.ExitFlushTimeout, 1)
391-
}
392-
}()
393-
394-
return nil
395-
}
396-
397-
// countEnsemblePeerNodes returns how many daemons share this clique excluding the local node.
398-
func countEnsemblePeerNodes(cliqueID, localNodeName string, daemons []*nvapi.ComputeDomainDaemonInfo) int {
358+
// countPeerNodes returns how many daemons share this clique excluding the local node.
359+
func countPeerNodes(cliqueID, localNodeName string, daemons []*nvapi.ComputeDomainDaemonInfo) int {
399360
if cliqueID == "" {
400361
return 0
401362
}
402363
n := 0
403364
for _, d := range daemons {
404-
if d == nil || d.CliqueID != cliqueID {
405-
continue
406-
}
407-
if d.NodeName == localNodeName {
408-
continue
365+
if d != nil && d.CliqueID == cliqueID && d.NodeName != localNodeName {
366+
n++
409367
}
410-
n++
411368
}
412369
return n
413370
}
@@ -422,7 +379,7 @@ func IMEXDaemonUpdateLoopWithIPs(ctx context.Context, controller *Controller, cl
422379
klog.Infof("shutdown: stop IMEXDaemonUpdateLoopWithIPs")
423380
return nil
424381
case daemons := <-controller.GetDaemonInfoUpdateChan():
425-
daemonmetrics.SetComputeDomainDaemonEnsemblePeerNodes(countEnsemblePeerNodes(cliqueID, controller.daemonInfoManager.GetLocalNodeName(), daemons))
382+
metrics.SetComputeDomainDaemonPeerNodes(countPeerNodes(cliqueID, controller.daemonInfoManager.GetLocalNodeName(), daemons))
426383
if err := writeDaemonsConfig(cliqueID, daemons); err != nil {
427384
return fmt.Errorf("writeDaemonsConfig failed: %w", err)
428385
}
@@ -457,7 +414,7 @@ func IMEXDaemonUpdateLoopWithDNSNames(ctx context.Context, controller *Controlle
457414
klog.Infof("shutdown: stop IMEXDaemonUpdateLoopWithDNSNames")
458415
return nil
459416
case daemons := <-controller.GetDaemonInfoUpdateChan():
460-
daemonmetrics.SetComputeDomainDaemonEnsemblePeerNodes(countEnsemblePeerNodes(dnsNameManager.cliqueID, controller.daemonInfoManager.GetLocalNodeName(), daemons))
417+
metrics.SetComputeDomainDaemonPeerNodes(countPeerNodes(dnsNameManager.cliqueID, controller.daemonInfoManager.GetLocalNodeName(), daemons))
461418
updated, err := dnsNameManager.UpdateDNSNameMappings(daemons)
462419
if err != nil {
463420
return fmt.Errorf("failed to update DNS name => IP mappings: %w", err)

cmd/compute-domain-kubelet-plugin/main.go

Lines changed: 2 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -20,16 +20,11 @@ import (
2020
"context"
2121
"errors"
2222
"fmt"
23-
"net"
24-
"net/http"
2523
"os"
2624
"os/signal"
27-
"path"
2825
"path/filepath"
2926
"syscall"
30-
"time"
3127

32-
"github.com/prometheus/client_golang/prometheus/promhttp"
3328
"github.com/urfave/cli/v2"
3429

3530
"k8s.io/component-base/logs"
@@ -40,6 +35,7 @@ import (
4035
"github.com/NVIDIA/k8s-dra-driver-gpu/internal/info"
4136
"github.com/NVIDIA/k8s-dra-driver-gpu/pkg/featuregates"
4237
pkgflags "github.com/NVIDIA/k8s-dra-driver-gpu/pkg/flags"
38+
"github.com/NVIDIA/k8s-dra-driver-gpu/pkg/metrics"
4339
)
4440

4541
const (
@@ -259,7 +255,7 @@ func RunPlugin(ctx context.Context, config *Config) error {
259255
defer cancel()
260256

261257
if config.flags.httpEndpoint != "" {
262-
if err := setupMetricsEndpoint(ctx, config.flags.httpEndpoint, config.flags.metricsPath); err != nil {
258+
if err := metrics.RunPrometheusMetricsServer(ctx, config.flags.httpEndpoint, config.flags.metricsPath); err != nil {
263259
return fmt.Errorf("setup metrics endpoint: %w", err)
264260
}
265261
}
@@ -285,39 +281,6 @@ func RunPlugin(ctx context.Context, config *Config) error {
285281
return nil
286282
}
287283

288-
func setupMetricsEndpoint(ctx context.Context, endpoint, metricsPath string) error {
289-
if metricsPath == "" {
290-
return nil
291-
}
292-
293-
mux := http.NewServeMux()
294-
actualPath := path.Join("/", metricsPath)
295-
mux.Handle(actualPath, promhttp.Handler())
296-
listener, err := net.Listen("tcp", endpoint)
297-
if err != nil {
298-
return fmt.Errorf("listen on metrics endpoint: %w", err)
299-
}
300-
301-
server := &http.Server{Handler: mux}
302-
go func() {
303-
<-ctx.Done()
304-
shutdownCtx, cancel := context.WithTimeout(context.WithoutCancel(ctx), 30*time.Second)
305-
defer cancel()
306-
_ = server.Shutdown(shutdownCtx)
307-
}()
308-
309-
go func() {
310-
klog.InfoS("Starting metrics HTTP server", "endpoint", endpoint, "path", actualPath)
311-
err := server.Serve(listener)
312-
if err != nil && !errors.Is(err, http.ErrServerClosed) {
313-
klog.ErrorS(err, "metrics HTTP server failed")
314-
klog.FlushAndExit(klog.ExitFlushTimeout, 1)
315-
}
316-
}()
317-
318-
return nil
319-
}
320-
321284
// setNvidiaCDIHookPath ensures the proper flag is set with the host path for the nvidia-cdi-hook binary.
322285
// If 'f.nvidiaCDIHookPath' is already set (from the command line), do nothing.
323286
// If 'f.nvidiaCDIHookPath' is empty, it copies the nvidia-cdi-hook binary from

cmd/gpu-kubelet-plugin/device_state.go

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1214,7 +1214,6 @@ func syncPreparedDevicesGaugeFromCheckpoint(nodeName string, cp *Checkpoint) {
12141214
}
12151215

12161216
for dt, count := range counts {
1217-
// draPreparedDevices.WithLabelValues(DriverName, dt).Set(float64(count))
12181217
drametrics.SetPreparedDevicesCounts(nodeName, DriverName, dt, count)
12191218
}
12201219
}

cmd/gpu-kubelet-plugin/main.go

Lines changed: 2 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -20,16 +20,11 @@ import (
2020
"context"
2121
"errors"
2222
"fmt"
23-
"net"
24-
"net/http"
2523
"os"
2624
"os/signal"
27-
"path"
2825
"path/filepath"
2926
"syscall"
30-
"time"
3127

32-
"github.com/prometheus/client_golang/prometheus/promhttp"
3328
"github.com/urfave/cli/v2"
3429

3530
"k8s.io/component-base/logs"
@@ -40,6 +35,7 @@ import (
4035
"github.com/NVIDIA/k8s-dra-driver-gpu/internal/info"
4136
"github.com/NVIDIA/k8s-dra-driver-gpu/pkg/featuregates"
4237
pkgflags "github.com/NVIDIA/k8s-dra-driver-gpu/pkg/flags"
38+
"github.com/NVIDIA/k8s-dra-driver-gpu/pkg/metrics"
4339
)
4440

4541
const (
@@ -274,7 +270,7 @@ func RunPlugin(ctx context.Context, config *Config) error {
274270
defer cancel()
275271

276272
if config.flags.httpEndpoint != "" {
277-
if err := setupMetricsEndpoint(ctx, config.flags.httpEndpoint, config.flags.metricsPath); err != nil {
273+
if err := metrics.RunPrometheusMetricsServer(ctx, config.flags.httpEndpoint, config.flags.metricsPath); err != nil {
278274
return fmt.Errorf("setup metrics endpoint: %w", err)
279275
}
280276
}
@@ -300,39 +296,6 @@ func RunPlugin(ctx context.Context, config *Config) error {
300296
return nil
301297
}
302298

303-
func setupMetricsEndpoint(ctx context.Context, endpoint, metricsPath string) error {
304-
if metricsPath == "" {
305-
return nil
306-
}
307-
308-
mux := http.NewServeMux()
309-
actualPath := path.Join("/", metricsPath)
310-
mux.Handle(actualPath, promhttp.Handler())
311-
listener, err := net.Listen("tcp", endpoint)
312-
if err != nil {
313-
return fmt.Errorf("listen on metrics endpoint: %w", err)
314-
}
315-
316-
server := &http.Server{Handler: mux}
317-
go func() {
318-
<-ctx.Done()
319-
shutdownCtx, cancel := context.WithTimeout(context.WithoutCancel(ctx), 30*time.Second)
320-
defer cancel()
321-
_ = server.Shutdown(shutdownCtx)
322-
}()
323-
324-
go func() {
325-
klog.InfoS("Starting metrics HTTP server", "endpoint", endpoint, "path", actualPath)
326-
err := server.Serve(listener)
327-
if err != nil && !errors.Is(err, http.ErrServerClosed) {
328-
klog.ErrorS(err, "metrics HTTP server failed")
329-
klog.FlushAndExit(klog.ExitFlushTimeout, 1)
330-
}
331-
}()
332-
333-
return nil
334-
}
335-
336299
// setNvidiaCDIHookPath ensures the proper flag is set with the host path for the nvidia-cdi-hook binary.
337300
// If 'f.nvidiaCDIHookPath' is already set (from the command line), do nothing.
338301
// If 'f.nvidiaCDIHookPath' is empty, it copies the nvidia-cdi-hook binary from

pkg/metrics/compute_domain_daemon.go

Lines changed: 9 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -20,31 +20,29 @@ import (
2020
"sync"
2121

2222
"github.com/prometheus/client_golang/prometheus"
23+
"k8s.io/component-base/metrics/legacyregistry"
2324
)
2425

2526
var (
2627
computeDomainDaemonMetricsOnce sync.Once
27-
ensemblePeerNodes prometheus.Gauge
28+
peerNodes prometheus.Gauge
2829
)
2930

3031
func registerComputeDomainDaemonMetrics() {
3132
computeDomainDaemonMetricsOnce.Do(func() {
32-
ensemblePeerNodes = prometheus.NewGauge(
33+
peerNodes = prometheus.NewGauge(
3334
prometheus.GaugeOpts{
3435
Namespace: "nvidia_gpu_dra",
35-
Name: "compute_domain_daemon_ensemble_peer_nodes",
36-
Help: "Number of other nodes in the same IMEX clique as this daemon, as last reported by ComputeDomain status or ComputeDomainClique (excludes this node's daemon entry).",
36+
Name: "compute_domain_daemon_peer_nodes",
37+
Help: "Number of other nodes in the same ComputeDomain clique as this daemon, as last reported by ComputeDomain status or ComputeDomainClique (excludes this node's daemon entry).",
3738
},
3839
)
39-
prometheus.MustRegister(ensemblePeerNodes)
40+
legacyregistry.RawMustRegister(peerNodes)
4041
})
4142
}
4243

43-
// SetComputeDomainDaemonEnsemblePeerNodes updates the ensemble peer gauge (other clique members only).
44-
func SetComputeDomainDaemonEnsemblePeerNodes(n int) {
44+
// SetComputeDomainDaemonPeerNodes updates the peer-node gauge (other clique members only).
45+
func SetComputeDomainDaemonPeerNodes(n int) {
4546
registerComputeDomainDaemonMetrics()
46-
if n < 0 {
47-
n = 0
48-
}
49-
ensemblePeerNodes.Set(float64(n))
47+
peerNodes.Set(float64(n))
5048
}

0 commit comments

Comments
 (0)