diff --git a/changelog/sashaodessa_patch-1.md b/changelog/sashaodessa_patch-1.md new file mode 100644 index 000000000000..75533ae0123c --- /dev/null +++ b/changelog/sashaodessa_patch-1.md @@ -0,0 +1,3 @@ +### Ignored + +- Replace fixed sleep delays with active polling in prometheus service test to improve test reliability. diff --git a/monitoring/prometheus/service_test.go b/monitoring/prometheus/service_test.go index 3087ab3baa3a..569d28c3259b 100644 --- a/monitoring/prometheus/service_test.go +++ b/monitoring/prometheus/service_test.go @@ -26,8 +26,21 @@ func TestLifecycle(t *testing.T) { port := 1000 + rand.Intn(1000) prometheusService := NewService(t.Context(), fmt.Sprintf(":%d", port), nil) prometheusService.Start() - // Give service time to start. - time.Sleep(time.Second) + // Actively wait until the service responds on /metrics (faster and less flaky than a fixed sleep) + deadline := time.Now().Add(3 * time.Second) + for { + if time.Now().After(deadline) { + t.Fatalf("metrics endpoint not ready within timeout") + } + resp, err := http.Get(fmt.Sprintf("http://localhost:%d/metrics", port)) + if err == nil { + _ = resp.Body.Close() + if resp.StatusCode == http.StatusOK { + break + } + } + time.Sleep(50 * time.Millisecond) + } // Query the service to ensure it really started. resp, err := http.Get(fmt.Sprintf("http://localhost:%d/metrics", port)) @@ -36,8 +49,18 @@ func TestLifecycle(t *testing.T) { err = prometheusService.Stop() require.NoError(t, err) - // Give service time to stop. - time.Sleep(time.Second) + // Actively wait until the service stops responding on /metrics + deadline = time.Now().Add(3 * time.Second) + for { + if time.Now().After(deadline) { + t.Fatalf("metrics endpoint still reachable after timeout") + } + _, err = http.Get(fmt.Sprintf("http://localhost:%d/metrics", port)) + if err != nil { + break + } + time.Sleep(50 * time.Millisecond) + } // Query the service to ensure it really stopped. _, err = http.Get(fmt.Sprintf("http://localhost:%d/metrics", port))