Skip to content

Commit 4a7eb59

Browse files
DavidS-ovm authored and actions-user committed
[ENG-3089] Relax bbolt cache durability to fix goroutine pileup (#4221)
## Summary

- Skip `fdatasync` on every bbolt commit in the sdpcache layer by setting `NoSync: true` and `NoFreelistSync: true`
- Eliminates the single-writer bottleneck that cascaded into 36K stuck goroutines under load (bbolt serializes write transactions behind fdatasync, ~930 pool workers were queuing on the write lock)
- Safe because sdpcache is a pure cache -- crash durability provides no value

## Linear Ticket

- **Ticket**: [ENG-3089](https://linear.app/overmind/issue/ENG-3089/relax-bbolt-cache-durability-to-fix-goroutine-pileup-under-load) — Relax bbolt cache durability to fix goroutine pileup under load
- **Purpose**: Remove the fdatasync bottleneck that causes cascading goroutine pileups when sources are under heavy query load
- **Priority**: Urgent

## Changes

Single file change in `go/sdpcache/bolt_cache.go`:

1. Added a package-level `cacheOpenOptions` variable with `NoSync: true`, `NoFreelistSync: true`, and the existing `Timeout: 5s`
2. Replaced all 7 inline `&bbolt.Options{Timeout: 5 * time.Second}` with `cacheOpenOptions`

All sdpcache tests pass. No API or behavioral change -- only the fsync guarantee is relaxed, which is irrelevant for a cache.

Made with [Cursor](https://cursor.com)

<!-- CURSOR_SUMMARY -->
---

> [!NOTE]
> **Medium Risk**
> Disables bbolt fsync and freelist syncing for the cache DB, which can improve throughput but increases the chance of cache corruption/loss after crashes or power failures.
>
> **Overview**
> Introduces a shared `cacheOpenOptions` for all `bbolt.Open` calls in `sdpcache` that sets `NoSync` and `NoFreelistSync` (keeping the existing 5s `Timeout`) to avoid per-commit `fdatasync` overhead.
>
> All cache DB open/reopen paths (startup, deletion recovery, and compaction temp DB creation/reopen) are updated to use these relaxed durability settings.
>
> <sup>Written by [Cursor Bugbot](https://cursor.com/dashboard?tab=bugbot) for commit 60297db5a8b4e78b25a685c30ad8242f84ed17b1. This will update automatically on new commits. Configure [here](https://cursor.com/dashboard?tab=bugbot).</sup>
<!-- /CURSOR_SUMMARY -->

GitOrigin-RevId: 4de3a30393489e86e14f68b16c8bbd81b8f1f3da
1 parent 1594265 commit 4a7eb59

1 file changed

Lines changed: 17 additions & 9 deletions

File tree

go/sdpcache/bolt_cache.go

Lines changed: 17 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -31,6 +31,16 @@ var (
3131
deletedBytesKey = []byte("deletedBytes")
3232
)
3333

34+
// cacheOpenOptions are the bbolt options used for every Open call in this
35+
// package. Since this is a cache layer, crash durability is unnecessary:
36+
// - NoSync skips fdatasync per commit, removing the single-writer bottleneck.
37+
// - NoFreelistSync skips persisting the freelist, reducing write amplification.
38+
var cacheOpenOptions = &bbolt.Options{
39+
Timeout: 5 * time.Second,
40+
NoSync: true,
41+
NoFreelistSync: true,
42+
}
43+
3444
// DefaultCompactThreshold is the default threshold for triggering compaction (100MB)
3545
const DefaultCompactThreshold = 100 * 1024 * 1024
3646

@@ -234,9 +244,7 @@ func NewBoltCache(path string, opts ...BoltCacheOption) (*BoltCache, error) {
234244
}
235245

236246
// bbolt.Open will open an existing file if present, or create a new one
237-
db, err := bbolt.Open(path, 0o600, &bbolt.Options{
238-
Timeout: 5 * time.Second,
239-
})
247+
db, err := bbolt.Open(path, 0o600, cacheOpenOptions)
240248
if err != nil {
241249
return nil, fmt.Errorf("failed to open bolt database: %w", err)
242250
}
@@ -457,7 +465,7 @@ func (c *BoltCache) deleteCacheFileLocked(ctx context.Context, span trace.Span)
457465
c.resetDeletedBytes()
458466

459467
// Reopen the database
460-
db, err := bbolt.Open(c.path, 0o600, &bbolt.Options{Timeout: 5 * time.Second})
468+
db, err := bbolt.Open(c.path, 0o600, cacheOpenOptions)
461469
if err != nil {
462470
span.RecordError(err)
463471
span.SetStatus(codes.Error, "failed to reopen database")
@@ -1341,13 +1349,13 @@ func (c *BoltCache) compact(ctx context.Context) error {
13411349
}
13421350

13431351
// Open the destination database
1344-
dstDB, err := bbolt.Open(tempPath, 0o600, &bbolt.Options{Timeout: 5 * time.Second})
1352+
dstDB, err := bbolt.Open(tempPath, 0o600, cacheOpenOptions)
13451353
if err != nil {
13461354
if isDiskFullError(err) {
13471355
// Attempt cleanup first - use locked version since we already hold the lock
13481356
c.purgeLocked(ctx, time.Now())
13491357
// Try again
1350-
dstDB, err = bbolt.Open(tempPath, 0o600, &bbolt.Options{Timeout: 5 * time.Second})
1358+
dstDB, err = bbolt.Open(tempPath, 0o600, cacheOpenOptions)
13511359
if err != nil {
13521360
return handleDiskFull(err, "temp database creation")
13531361
}
@@ -1364,7 +1372,7 @@ func (c *BoltCache) compact(ctx context.Context) error {
13641372
// Attempt cleanup first - use locked version since we already hold the lock
13651373
c.purgeLocked(ctx, time.Now())
13661374
// Try compaction again
1367-
dstDB2, retryErr := bbolt.Open(tempPath, 0o600, &bbolt.Options{Timeout: 5 * time.Second})
1375+
dstDB2, retryErr := bbolt.Open(tempPath, 0o600, cacheOpenOptions)
13681376
if retryErr != nil {
13691377
return handleDiskFull(retryErr, "temp database creation after cleanup")
13701378
}
@@ -1395,12 +1403,12 @@ func (c *BoltCache) compact(ctx context.Context) error {
13951403
// Replace the old file with the compacted one
13961404
if err := os.Rename(tempPath, c.path); err != nil {
13971405
// Try to reopen the original database
1398-
c.db, _ = bbolt.Open(c.path, 0o600, &bbolt.Options{Timeout: 5 * time.Second})
1406+
c.db, _ = bbolt.Open(c.path, 0o600, cacheOpenOptions)
13991407
return handleDiskFull(err, "rename")
14001408
}
14011409

14021410
// Reopen the database
1403-
db, err := bbolt.Open(c.path, 0o600, &bbolt.Options{Timeout: 5 * time.Second})
1411+
db, err := bbolt.Open(c.path, 0o600, cacheOpenOptions)
14041412
if err != nil {
14051413
span.RecordError(err)
14061414
span.SetStatus(codes.Error, "failed to reopen database")

0 commit comments

Comments
 (0)