Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions cmd/compute-domain-kubelet-plugin/driver.go
Original file line number Diff line number Diff line change
Expand Up @@ -122,9 +122,9 @@ func NewDriver(ctx context.Context, config *Config) (*driver, error) {
return nil, err
}

healthcheck, err := startHealthcheck(ctx, config)
healthcheck, err := setupHealthcheckPrimitives(ctx, config)
if err != nil {
return nil, fmt.Errorf("start healthcheck: %w", err)
return nil, fmt.Errorf("error setting up healthcheck primitives: %w", err)
}
driver.healthcheck = healthcheck

Expand Down
31 changes: 19 additions & 12 deletions cmd/compute-domain-kubelet-plugin/health.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,16 +46,17 @@ type healthcheck struct {
draClient drapb.DRAPluginClient
}

func startHealthcheck(ctx context.Context, config *Config) (*healthcheck, error) {
func setupHealthcheckPrimitives(ctx context.Context, config *Config) (*healthcheck, error) {
Copy link
Collaborator Author

@jgehrcke jgehrcke Oct 8, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

previous discussion: #633 (comment)

port := config.flags.healthcheckPort
if port < 0 {
return nil, nil
}

// Bind on all available interfaces.
addr := net.JoinHostPort("", strconv.Itoa(port))
lis, err := net.Listen("tcp", addr)
if err != nil {
return nil, fmt.Errorf("failed to listen for healthcheck service at %s: %w", addr, err)
return nil, fmt.Errorf("failed to listen on %s: %w", addr, err)
}

regSockPath := (&url.URL{
Expand All @@ -64,26 +65,28 @@ func startHealthcheck(ctx context.Context, config *Config) (*healthcheck, error)
// are enabled and the filename includes a uid.
Path: path.Join(config.flags.kubeletRegistrarDirectoryPath, DriverName+"-reg.sock"),
}).String()
klog.V(6).Infof("connecting to registration socket path=%s", regSockPath)

klog.V(6).Infof("Connect to registration socket at %s", regSockPath)
regConn, err := grpc.NewClient(
regSockPath,
grpc.WithTransportCredentials(insecure.NewCredentials()),
)
if err != nil {
return nil, fmt.Errorf("connect to registration socket: %w", err)
return nil, fmt.Errorf("error connecting to registration socket: %w", err)
}

draSockPath := (&url.URL{
Scheme: "unix",
Path: path.Join(config.DriverPluginPath(), "dra.sock"),
}).String()
klog.V(6).Infof("connecting to DRA socket path=%s", draSockPath)

klog.V(6).Infof("Connect to plugin socket at %s", draSockPath)
draConn, err := grpc.NewClient(
draSockPath,
grpc.WithTransportCredentials(insecure.NewCredentials()),
)
if err != nil {
return nil, fmt.Errorf("connect to DRA socket: %w", err)
return nil, fmt.Errorf("error connecting to plugin socket: %w", err)
}

server := grpc.NewServer()
Expand All @@ -97,9 +100,10 @@ func startHealthcheck(ctx context.Context, config *Config) (*healthcheck, error)
healthcheck.wg.Add(1)
go func() {
defer healthcheck.wg.Done()
klog.Infof("starting healthcheck service at %s", lis.Addr().String())
klog.Infof("Starting healthcheck server on %s", lis.Addr().String())
if err := server.Serve(lis); err != nil {
klog.Errorf("failed to serve healthcheck service on %s: %v", addr, err)
// Note(JP): let's review if this should be fatal
klog.Errorf("failed to start healthcheck server: %v", err)
}
}()

Expand All @@ -108,13 +112,13 @@ func startHealthcheck(ctx context.Context, config *Config) (*healthcheck, error)

func (h *healthcheck) Stop() {
if h.server != nil {
klog.Info("Stopping healthcheck service")
klog.Info("Stopping healthcheck server")
h.server.GracefulStop()
}
h.wg.Wait()
}

// Check implements [grpc_health_v1.HealthServer].
// Check implements [grpc_health_v1.HealthServer.Check].
Copy link
Collaborator Author

@jgehrcke jgehrcke Oct 8, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

previous discussion: #633 (comment)

func (h *healthcheck) Check(ctx context.Context, req *grpc_health_v1.HealthCheckRequest) (*grpc_health_v1.HealthCheckResponse, error) {
knownServices := map[string]struct{}{"": {}, "liveness": {}}
if _, known := knownServices[req.GetService()]; !known {
Expand All @@ -125,16 +129,19 @@ func (h *healthcheck) Check(ctx context.Context, req *grpc_health_v1.HealthCheck
Status: grpc_health_v1.HealthCheckResponse_NOT_SERVING,
}

// This simulates the kubelet reaching out to the plugin for discovery
// (towards registering it).
info, err := h.regClient.GetInfo(ctx, &registerapi.InfoRequest{})
if err != nil {
klog.ErrorS(err, "failed to call GetInfo")
klog.ErrorS(err, "failed to call GetInfo on registration socket")
return status, nil
}
klog.V(6).Infof("Successfully invoked GetInfo: %v", info)

// This simulates the kubelet reaching out to the plugin over the DRA
// socket by calling NodePrepareResources with an empty request.
_, err = h.draClient.NodePrepareResources(ctx, &drapb.NodePrepareResourcesRequest{})
if err != nil {
klog.ErrorS(err, "failed to call NodePrepareResources")
klog.ErrorS(err, "failed to call NodePrepareResources on plugin socket")
return status, nil
}
klog.V(6).Info("Successfully invoked NodePrepareResources")
Expand Down