@@ -20,32 +20,26 @@ import (
2020 "bytes"
2121 "context"
2222 "encoding/json"
23- "errors"
2423 "fmt"
25- "net"
26- "net/http"
2724 "os"
2825 "os/exec"
2926 "os/signal"
30- "path"
3127 "path/filepath"
3228 "sync"
3329 "syscall"
3430 "text/template"
35- "time"
3631
3732 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
3833 "k8s.io/apimachinery/pkg/types"
3934 "k8s.io/klog/v2"
4035
41- "github.com/prometheus/client_golang/prometheus/promhttp"
4236 "github.com/urfave/cli/v2"
4337
4438 nvapi "github.com/NVIDIA/k8s-dra-driver-gpu/api/nvidia.com/resource/v1beta1"
4539 "github.com/NVIDIA/k8s-dra-driver-gpu/internal/common"
4640 "github.com/NVIDIA/k8s-dra-driver-gpu/pkg/featuregates"
4741 pkgflags "github.com/NVIDIA/k8s-dra-driver-gpu/pkg/flags"
48- daemonmetrics "github.com/NVIDIA/k8s-dra-driver-gpu/pkg/metrics"
42+ metrics "github.com/NVIDIA/k8s-dra-driver-gpu/pkg/metrics"
4943)
5044
5145const (
@@ -234,10 +228,10 @@ func run(ctx context.Context, cancel context.CancelFunc, flags *Flags) error {
234228 }
235229
236230 if flags .httpEndpoint != "" {
237- if err := setupMetricsEndpoint (ctx , flags .httpEndpoint , flags .metricsPath ); err != nil {
231+ if err := metrics . RunPrometheusMetricsServer (ctx , flags .httpEndpoint , flags .metricsPath ); err != nil {
238232 return fmt .Errorf ("setup metrics endpoint: %w" , err )
239233 }
240- daemonmetrics . SetComputeDomainDaemonEnsemblePeerNodes (0 )
234+ metrics . SetComputeDomainDaemonPeerNodes (0 )
241235 }
242236
243237 // Create clientsets for Kubernetes API access
@@ -361,53 +355,16 @@ func run(ctx context.Context, cancel context.CancelFunc, flags *Flags) error {
361355 return nil
362356}
363357
364- func setupMetricsEndpoint (ctx context.Context , endpoint , metricsPath string ) error {
365- if metricsPath == "" {
366- return nil
367- }
368-
369- mux := http .NewServeMux ()
370- actualPath := path .Join ("/" , metricsPath )
371- mux .Handle (actualPath , promhttp .Handler ())
372- listener , err := net .Listen ("tcp" , endpoint )
373- if err != nil {
374- return fmt .Errorf ("listen on metrics endpoint: %w" , err )
375- }
376-
377- server := & http.Server {Handler : mux }
378- go func () {
379- <- ctx .Done ()
380- shutdownCtx , cancel := context .WithTimeout (context .WithoutCancel (ctx ), 30 * time .Second )
381- defer cancel ()
382- _ = server .Shutdown (shutdownCtx )
383- }()
384-
385- go func () {
386- klog .InfoS ("Starting metrics HTTP server" , "endpoint" , endpoint , "path" , actualPath )
387- err := server .Serve (listener )
388- if err != nil && ! errors .Is (err , http .ErrServerClosed ) {
389- klog .ErrorS (err , "metrics HTTP server failed" )
390- klog .FlushAndExit (klog .ExitFlushTimeout , 1 )
391- }
392- }()
393-
394- return nil
395- }
396-
397- // countEnsemblePeerNodes returns how many daemons share this clique excluding the local node.
398- func countEnsemblePeerNodes (cliqueID , localNodeName string , daemons []* nvapi.ComputeDomainDaemonInfo ) int {
358+ // countPeerNodes returns how many daemons share this clique excluding the local node.
359+ func countPeerNodes (cliqueID , localNodeName string , daemons []* nvapi.ComputeDomainDaemonInfo ) int {
399360 if cliqueID == "" {
400361 return 0
401362 }
402363 n := 0
403364 for _ , d := range daemons {
404- if d == nil || d .CliqueID != cliqueID {
405- continue
406- }
407- if d .NodeName == localNodeName {
408- continue
365+ if d != nil && d .CliqueID == cliqueID && d .NodeName != localNodeName {
366+ n ++
409367 }
410- n ++
411368 }
412369 return n
413370}
@@ -422,7 +379,7 @@ func IMEXDaemonUpdateLoopWithIPs(ctx context.Context, controller *Controller, cl
422379 klog .Infof ("shutdown: stop IMEXDaemonUpdateLoopWithIPs" )
423380 return nil
424381 case daemons := <- controller .GetDaemonInfoUpdateChan ():
425- daemonmetrics . SetComputeDomainDaemonEnsemblePeerNodes ( countEnsemblePeerNodes (cliqueID , controller .daemonInfoManager .GetLocalNodeName (), daemons ))
382+ metrics . SetComputeDomainDaemonPeerNodes ( countPeerNodes (cliqueID , controller .daemonInfoManager .GetLocalNodeName (), daemons ))
426383 if err := writeDaemonsConfig (cliqueID , daemons ); err != nil {
427384 return fmt .Errorf ("writeDaemonsConfig failed: %w" , err )
428385 }
@@ -457,7 +414,7 @@ func IMEXDaemonUpdateLoopWithDNSNames(ctx context.Context, controller *Controlle
457414 klog .Infof ("shutdown: stop IMEXDaemonUpdateLoopWithDNSNames" )
458415 return nil
459416 case daemons := <- controller .GetDaemonInfoUpdateChan ():
460- daemonmetrics . SetComputeDomainDaemonEnsemblePeerNodes ( countEnsemblePeerNodes (dnsNameManager .cliqueID , controller .daemonInfoManager .GetLocalNodeName (), daemons ))
417+ metrics . SetComputeDomainDaemonPeerNodes ( countPeerNodes (dnsNameManager .cliqueID , controller .daemonInfoManager .GetLocalNodeName (), daemons ))
461418 updated , err := dnsNameManager .UpdateDNSNameMappings (daemons )
462419 if err != nil {
463420 return fmt .Errorf ("failed to update DNS name => IP mappings: %w" , err )
0 commit comments