Skip to content

Commit 4d88848

Browse files
DavidS-ovmtphoney
authored andcommitted
fix: always attach goroutine profile on stuck waitgroup detection (#4185)
singleflight.Group.Do returns shared=true for ALL callers when multiple hit concurrently — including the original. The previous !profileShared gate meant no caller ever stored the profile in the thundering-herd scenario, which is exactly when the diagnostic data is most needed. Removed the shared return value from captureGoroutineSummary since the singleflight still deduplicates the expensive pprof capture itself. <!-- CURSOR_SUMMARY --> --- > [!NOTE] > **Low Risk** > Low risk diagnostic change that only affects stuck waitgroup tracing/logging paths, with a small potential increase in trace payload size. > > **Overview** > When `ExecuteQuery` detects a cancelled context and the adapter waitgroup remains stuck, the `waitgroup.stuck` span event now **always** includes the compacted goroutine pprof summary. > > This simplifies `captureGoroutineSummary` to return only the shared profile string (still singleflight-deduped) and removes the prior conditional that could omit the profile under concurrent callers; it also makes minor formatting/cleanup changes (e.g., grouping atomic counters). > > <sup>Written by [Cursor Bugbot](https://cursor.com/dashboard?tab=bugbot) for commit ddd2cddfb4d1c66003535bc296c4b11e9cb377e1. This will update automatically on new commits. Configure [here](https://cursor.com/dashboard?tab=bugbot).</sup> <!-- /CURSOR_SUMMARY --> GitOrigin-RevId: db976f32f2cae318e5f8f5637e850e3de21b9440
1 parent 749afad commit 4d88848

1 file changed

Lines changed: 12 additions & 14 deletions

File tree

go/discovery/enginerequests.go

Lines changed: 12 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -187,10 +187,9 @@ func compactGoroutineProfile(s string) string {
187187
// captureGoroutineSummary returns a compacted goroutine profile (pprof debug=1)
188188
// truncated to maxBytes, deduplicated via singleflight. When many ExecuteQuery
189189
// goroutines hit the stuck timeout simultaneously, only one runs the
190-
// (stop-the-world) pprof capture; the rest share its result. The shared return
191-
// value indicates whether this caller reused another's capture.
192-
func captureGoroutineSummary(maxBytes int) (profile string, shared bool) {
193-
v, _, shared := goroutineProfileGroup.Do("goroutine-profile", func() (any, error) {
190+
// (stop-the-world) pprof capture; the rest share its result.
191+
func captureGoroutineSummary(maxBytes int) string {
192+
v, _, _ := goroutineProfileGroup.Do("goroutine-profile", func() (any, error) {
194193
var buf bytes.Buffer
195194
_ = pprof.Lookup("goroutine").WriteTo(&buf, 1)
196195
s := compactGoroutineProfile(buf.String())
@@ -199,11 +198,13 @@ func captureGoroutineSummary(maxBytes int) (profile string, shared bool) {
199198
}
200199
return s, nil
201200
})
202-
return v.(string), shared
201+
return v.(string)
203202
}
204203

205-
var listExecutionPoolCount atomic.Int32
206-
var getExecutionPoolCount atomic.Int32
204+
var (
205+
listExecutionPoolCount atomic.Int32
206+
getExecutionPoolCount atomic.Int32
207+
)
207208

208209
// ExecuteQuery Executes a single Query and returns the results without any
209210
// linking. Will return an error if the Query couldn't be run.
@@ -342,17 +343,14 @@ func (e *Engine) ExecuteQuery(ctx context.Context, query *sdp.Query, responses c
342343
return
343344
case <-time.After(longRunningAdaptersTimeout):
344345
// If we're here, then the wait group didn't finish in time
345-
goroutineSummary, profileShared := captureGoroutineSummary(48_000)
346+
goroutineSummary := captureGoroutineSummary(48_000)
346347
expandedMutex.RLock()
347-
stuckAttrs := []attribute.KeyValue{
348+
span.AddEvent("waitgroup.stuck", trace.WithAttributes(
348349
attribute.Int("ovm.stuck.goroutineCount", runtime.NumGoroutine()),
349350
attribute.Int("ovm.stuck.totalQueries", totalQueries),
350351
attribute.Int("ovm.stuck.remainingQueries", len(expanded)),
351-
}
352-
if !profileShared {
353-
stuckAttrs = append(stuckAttrs, attribute.String("ovm.stuck.goroutineProfile", goroutineSummary))
354-
}
355-
span.AddEvent("waitgroup.stuck", trace.WithAttributes(stuckAttrs...))
352+
attribute.String("ovm.stuck.goroutineProfile", goroutineSummary),
353+
))
356354
for q, adapter := range expanded {
357355
span.AddEvent("waitgroup.stuck.adapter", trace.WithAttributes(
358356
attribute.String("ovm.stuck.adapter", adapter.Name()),

0 commit comments

Comments
 (0)