Skip to content

Commit f935349

Browse files
Add webhook service endpoint readiness check before creating default PtpOperatorConfig
The operator could fail to create the default PtpOperatorConfig on startup due to a race condition: the local webhook server is ready but the Kubernetes Service endpoints are not yet populated, causing the API server's validating webhook call to fail with "no endpoints available". Because the creation is not retried, the default config is never created and the linuxptp-daemon DaemonSet is never spawned.

Extend waitForWebhookServer to also verify that the webhook-service is reachable via cluster DNS (using a TCP dial) before proceeding to create the default config. This ensures the endpoint controller has had time to populate the Service endpoints.

Use goroutine-local err variables to avoid a data race with mgr.Start, and check ctx.Done() so the goroutine terminates cleanly on manager shutdown.

Generated-by: Cursor
1 parent 158f56c commit f935349

File tree

1 file changed

+48
-23
lines changed

1 file changed

+48
-23
lines changed

main.go

Lines changed: 48 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ import (
2121
"crypto/tls"
2222
"flag"
2323
"fmt"
24+
"net"
2425
"net/http"
2526
"os"
2627
"strings"
@@ -246,17 +247,14 @@ func main() {
246247
}
247248

248249
go func() {
249-
// Wait until the webhook server is ready.
250250
setupLog.Info("waiting for validating webhook to be ready")
251-
err = waitForWebhookServer(checker)
252-
if err != nil {
251+
if err := waitForWebhookServer(ctx, checker); err != nil {
253252
setupLog.Error(err, "unable to create default PtpOperatorConfig due to webhook not ready")
254-
} else {
255-
// create default before the webhook are setup
256-
err = createDefaultOperatorConfig(ctrl.GetConfigOrDie())
257-
if err != nil {
258-
setupLog.Error(err, "unable to create default PtpOperatorConfig")
259-
}
253+
return
254+
}
255+
256+
if err := createDefaultOperatorConfig(ctx, restConfig); err != nil {
257+
setupLog.Error(err, "unable to create default PtpOperatorConfig")
260258
}
261259
}()
262260
setupLog.Info("starting manager")
@@ -267,7 +265,7 @@ func main() {
267265

268266
}
269267

270-
func createDefaultOperatorConfig(cfg *rest.Config) error {
268+
func createDefaultOperatorConfig(ctx context.Context, cfg *rest.Config) error {
271269
logger := setupLog.WithName("createDefaultOperatorConfig")
272270
c, err := client.New(cfg, client.Options{Scheme: scheme})
273271
if err != nil {
@@ -278,15 +276,15 @@ func createDefaultOperatorConfig(cfg *rest.Config) error {
278276
DaemonNodeSelector: map[string]string{},
279277
},
280278
}
281-
err = c.Get(context.TODO(), types.NamespacedName{
279+
err = c.Get(ctx, types.NamespacedName{
282280
Name: names.DefaultOperatorConfigName, Namespace: names.Namespace}, config)
283281

284282
if err != nil {
285283
if errors.IsNotFound(err) {
286284
logger.Info("Create default OperatorConfig")
287285
config.Namespace = names.Namespace
288286
config.Name = names.DefaultOperatorConfigName
289-
err = c.Create(context.TODO(), config)
287+
err = c.Create(ctx, config)
290288
if err != nil {
291289
return err
292290
}
@@ -322,28 +320,55 @@ func fetchTLSProfile(cfg *rest.Config) (configv1.TLSProfileSpec, error) {
322320
return openshifttls.FetchAPIServerTLSProfile(context.TODO(), c)
323321
}
324322

325-
// waitForWebhookServer waits until the webhook server is ready.
326-
func waitForWebhookServer(checker func(req *http.Request) error) error {
323+
// waitForWebhookServer waits until the local webhook server is listening and
324+
// the webhook-service is reachable via the cluster DNS. The latter is necessary
325+
// because the Kubernetes endpoint controller populates the Service endpoints
326+
// asynchronously; without this check, the API server may reject webhook calls
327+
// with "no endpoints available".
328+
func waitForWebhookServer(ctx context.Context, checker func(req *http.Request) error) error {
327329
const (
328-
timeout = 30 * time.Second // Adjust timeout as needed
329-
pollingFreq = 1 * time.Second // Polling frequency
330+
timeout = 60 * time.Second
331+
pollingFreq = 1 * time.Second
332+
dialTimeout = 2 * time.Second
330333
)
331334
start := time.Now()
335+
webhookServiceAddr := fmt.Sprintf("webhook-service.%s.svc:%d", names.Namespace, 443)
332336

333-
// Create an HTTP request to check the readiness of the webhook server.
334-
req, err := http.NewRequest("GET", "https://localhost:9443/healthz", nil)
337+
req, err := http.NewRequestWithContext(ctx, "GET", "https://localhost:9443/healthz", nil)
335338
if err != nil {
336339
return err
337340
}
338341

339-
// Poll the checker function until it returns nil (indicating success)
340-
// or until the timeout is reached.
341342
for {
342343
if err = checker(req); err == nil {
343-
return nil
344-
} else if time.Since(start) > timeout {
344+
break
345+
}
346+
if time.Since(start) > timeout {
345347
return fmt.Errorf("timeout waiting for webhook server to start")
346348
}
347-
time.Sleep(pollingFreq) // Poll every second
349+
select {
350+
case <-ctx.Done():
351+
return ctx.Err()
352+
case <-time.After(pollingFreq):
353+
}
354+
}
355+
356+
setupLog.Info("webhook server started, waiting for service endpoints")
357+
358+
for {
359+
conn, err := (&net.Dialer{Timeout: dialTimeout}).DialContext(ctx, "tcp", webhookServiceAddr)
360+
if err == nil {
361+
conn.Close()
362+
setupLog.Info("webhook service endpoints are ready")
363+
return nil
364+
}
365+
if time.Since(start) > timeout {
366+
return fmt.Errorf("timeout waiting for webhook service endpoints to be ready")
367+
}
368+
select {
369+
case <-ctx.Done():
370+
return ctx.Err()
371+
case <-time.After(pollingFreq):
372+
}
348373
}
349374
}

0 commit comments

Comments
 (0)