Skip to content

Commit 9b6371e

Browse files
stats/opentelemetry: fix flaky TestTraceSpan_WithRetriesAndNameResolutionDelay
This commit fixes the flaky test TestTraceSpan_WithRetriesAndNameResolutionDelay, which was introduced in PR #8342 and caused that PR to be reverted.

Root cause: the test had timing-related race conditions:
1. The goroutine that updates the resolver state could complete before or after the delayed resolution event was fully processed and recorded in spans.
2. Span export timing was not synchronized with test validation, causing the test to sometimes check spans before they were fully exported.

Fix:
1. Added a 'stateUpdated' event to synchronize between the resolver state update completing and span validation beginning.
2. Added an explicit wait for the stateUpdated event before validating spans.
3. Added a 50ms sleep after RPC completion to give the span exporter time to process and export all spans before validation.

Testing:
- The test now passes consistently (10+ consecutive runs).
- It passes with the race detector enabled (-race flag).
- No data races were detected.

Fixes #8700
1 parent 60a5a48 commit 9b6371e

File tree

1 file changed

+19
-0
lines changed

1 file changed

+19
-0
lines changed

stats/opentelemetry/e2e_test.go

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1627,6 +1627,7 @@ func (s) TestTraceSpan_WithRetriesAndNameResolutionDelay(t *testing.T) {
16271627
for _, tt := range tests {
16281628
t.Run(tt.name, func(t *testing.T) {
16291629
resolutionWait := grpcsync.NewEvent()
1630+
stateUpdated := grpcsync.NewEvent()
16301631
prevHook := internal.NewStreamWaitingForResolver
16311632
internal.NewStreamWaitingForResolver = func() { resolutionWait.Fire() }
16321633
defer func() { internal.NewStreamWaitingForResolver = prevHook }()
@@ -1669,14 +1670,32 @@ func (s) TestTraceSpan_WithRetriesAndNameResolutionDelay(t *testing.T) {
16691670
ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
16701671
defer cancel()
16711672

1673+
// Start the goroutine that will update resolver state once the stream
1674+
// is waiting for resolution. Use stateUpdated event to ensure the
1675+
// resolver state is updated before we start validating spans.
16721676
go func() {
16731677
<-resolutionWait.Done()
16741678
rb.UpdateState(resolver.State{Addresses: []resolver.Address{{Addr: ss.Address}}})
1679+
stateUpdated.Fire()
16751680
}()
1681+
16761682
if err := tt.doCall(ctx, client); err != nil {
16771683
t.Fatalf("%s call failed: %v", tt.name, err)
16781684
}
16791685

1686+
// Wait for the resolver state to be updated to ensure the delayed
1687+
// resolution event has been processed.
1688+
select {
1689+
case <-stateUpdated.Done():
1690+
case <-ctx.Done():
1691+
t.Fatal("Timed out waiting for resolver state update")
1692+
}
1693+
1694+
// Give the span exporter a small amount of time to process and export
1695+
// all spans from the completed RPC. This reduces flakiness by ensuring
1696+
// all trace events have been fully recorded before validation.
1697+
time.Sleep(50 * time.Millisecond)
1698+
16801699
wantSpanInfo := traceSpanInfo{
16811700
name: tt.spanName,
16821701
spanKind: oteltrace.SpanKindClient.String(),

0 commit comments

Comments
 (0)