Skip to content

Commit 9eb7483

Browse files
committed
Adding DiskCache stats
1 parent b994285 commit 9eb7483

File tree

9 files changed

+381
-120
lines changed

9 files changed

+381
-120
lines changed

go/pkg/client/client.go

Lines changed: 7 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ import (
1515
"time"
1616

1717
"errors"
18+
1819
"github.com/bazelbuild/remote-apis-sdks/go/pkg/actas"
1920
"github.com/bazelbuild/remote-apis-sdks/go/pkg/balancer"
2021
"github.com/bazelbuild/remote-apis-sdks/go/pkg/chunker"
@@ -244,6 +245,10 @@ func (c *Client) Close() error {
244245
if c.casConnection != c.connection {
245246
return c.casConnection.Close()
246247
}
248+
if c.diskCache != nil {
249+
// Waits for local disk GC to complete.
250+
c.diskCache.Shutdown()
251+
}
247252
return nil
248253
}
249254

@@ -354,21 +359,12 @@ func (o *TreeSymlinkOpts) Apply(c *Client) {
354359
}
355360

356361
type DiskCacheOpts struct {
357-
Context context.Context
358-
Path string
359-
MaxCapacityGb float64
362+
DiskCache *diskcache.DiskCache
360363
}
361364

362365
// Apply sets the client's TreeSymlinkOpts.
363366
func (o *DiskCacheOpts) Apply(c *Client) {
364-
if o.Path != "" {
365-
capBytes := uint64(o.MaxCapacityGb * 1024 * 1024 * 1024)
366-
var err error
367-
// TODO(ola): propagate errors from Apply.
368-
if c.diskCache, err = diskcache.New(o.Context, o.Path, capBytes); err != nil {
369-
log.Errorf("Error initializing disk cache on %s: %v", o.Path, err)
370-
}
371-
}
367+
c.diskCache = o.DiskCache
372368
}
373369

374370
// MaxBatchDigests is maximum amount of digests to batch in upload and download operations.

go/pkg/diskcache/BUILD.bazel

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ go_test(
2828
"//go/pkg/testutil",
2929
"@com_github_bazelbuild_remote_apis//build/bazel/remote/execution/v2:remote_execution_go_proto",
3030
"@com_github_google_go_cmp//cmp:go_default_library",
31-
"@com_github_pborman_uuid//:go_default_library",
31+
"@com_github_google_uuid//:uuid",
3232
"@org_golang_x_sync//errgroup:go_default_library",
3333
],
3434
)

go/pkg/diskcache/diskcache.go

Lines changed: 94 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -97,12 +97,28 @@ type DiskCache struct {
9797
mu sync.Mutex // protects the queue.
9898
store sync.Map // map of keys to qitems.
9999
queue *priorityQueue // keys by last accessed time.
100-
sizeBytes int64 // total size.
101100
ctx context.Context
102101
shutdown chan bool
102+
shutdownOnce sync.Once
103103
gcTick uint64
104104
gcReq chan uint64
105-
testGcTicks chan uint64
105+
gcDone chan bool
106+
stats *DiskCacheStats
107+
}
108+
109+
type DiskCacheStats struct {
110+
TotalSizeBytes int64
111+
TotalNumFiles int64
112+
NumFilesStored int64
113+
TotalStoredBytes int64
114+
NumFilesGCed int64
115+
TotalGCedSizeBytes int64
116+
NumCacheHits int64
117+
NumCacheMisses int64
118+
TotalCacheHitSizeBytes int64
119+
InitTime time.Duration
120+
TotalGCTime time.Duration
121+
TotalGCDiskOpsTime time.Duration
106122
}
107123

108124
func New(ctx context.Context, root string, maxCapacityBytes uint64) (*DiskCache, error) {
@@ -115,7 +131,11 @@ func New(ctx context.Context, root string, maxCapacityBytes uint64) (*DiskCache,
115131
},
116132
gcReq: make(chan uint64, maxConcurrentRequests),
117133
shutdown: make(chan bool),
134+
gcDone: make(chan bool),
135+
stats: &DiskCacheStats{},
118136
}
137+
start := time.Now()
138+
defer func() { atomic.AddInt64((*int64)(&res.stats.InitTime), int64(time.Since(start))) }()
119139
heap.Init(res.queue)
120140
if err := os.MkdirAll(root, os.ModePerm); err != nil {
121141
return nil, err
@@ -135,30 +155,31 @@ func New(ctx context.Context, root string, maxCapacityBytes uint64) (*DiskCache,
135155
return filepath.WalkDir(prefixDir, func(path string, d fs.DirEntry, err error) error {
136156
// We log and continue on all errors, because cache read errors are not critical.
137157
if err != nil {
138-
return fmt.Errorf("error reading cache directory: %v", err)
158+
return fmt.Errorf("error reading cache directory: %w", err)
139159
}
140160
if d.IsDir() {
141161
return nil
142162
}
143163
subdir := filepath.Base(filepath.Dir(path))
144-
k, err := res.getKeyFromFileName(subdir + d.Name())
164+
k, err := getKeyFromFileName(subdir + d.Name())
145165
if err != nil {
146-
return fmt.Errorf("error parsing cached file name %s: %v", path, err)
166+
return fmt.Errorf("error parsing cached file name %s: %w", path, err)
147167
}
148-
atime, err := getLastAccessTime(path)
168+
info, err := d.Info()
149169
if err != nil {
150-
return fmt.Errorf("error getting last accessed time of %s: %v", path, err)
170+
return fmt.Errorf("error getting file info of %s: %w", path, err)
151171
}
152172
it := &qitem{
153173
key: k,
154-
lat: atime,
174+
lat: fileInfoToAccessTime(info),
155175
}
156176
size, err := res.getItemSize(k)
157177
if err != nil {
158-
return fmt.Errorf("error getting file size of %s: %v", path, err)
178+
return fmt.Errorf("error getting file size of %s: %w", path, err)
159179
}
160180
res.store.Store(k, it)
161-
atomic.AddInt64(&res.sizeBytes, size)
181+
atomic.AddInt64(&res.stats.TotalSizeBytes, size)
182+
atomic.AddInt64(&res.stats.TotalNumFiles, 1)
162183
res.mu.Lock()
163184
heap.Push(res.queue, it)
164185
res.mu.Unlock()
@@ -180,42 +201,50 @@ func (d *DiskCache) getItemSize(k key) (int64, error) {
180201
fname := d.getPath(k)
181202
info, err := os.Stat(fname)
182203
if err != nil {
183-
return 0, fmt.Errorf("Error getting info for %s: %v", fname, err)
204+
return 0, fmt.Errorf("error getting info for %s: %w", fname, err)
184205
}
185206
return info.Size(), nil
186207
}
187208

188-
// Releases resources and terminates the GC daemon. Should be the last call to the DiskCache.
209+
// Terminates the GC daemon, waiting for it to complete. No further Store* calls to the DiskCache should be made.
189210
func (d *DiskCache) Shutdown() {
190-
d.shutdown <- true
191-
}
192-
193-
func (d *DiskCache) TotalSizeBytes() uint64 {
194-
return uint64(atomic.LoadInt64(&d.sizeBytes))
211+
d.shutdownOnce.Do(func() {
212+
d.shutdown <- true
213+
<-d.gcDone
214+
log.Infof("DiskCacheStats: %+v", d.stats)
215+
})
195216
}
196217

197-
// This function is defined in https://pkg.go.dev/strings#CutSuffix
198-
// It is copy/pasted here as a hack, because I failed to upgrade the *Reclient* repo to the latest Go 1.20.7.
199-
func CutSuffix(s, suffix string) (before string, found bool) {
200-
if !strings.HasSuffix(s, suffix) {
201-
return s, false
218+
func (d *DiskCache) GetStats() *DiskCacheStats {
219+
// Return a copy for safety.
220+
return &DiskCacheStats{
221+
TotalSizeBytes: atomic.LoadInt64(&d.stats.TotalSizeBytes),
222+
TotalNumFiles: atomic.LoadInt64(&d.stats.TotalNumFiles),
223+
NumFilesStored: atomic.LoadInt64(&d.stats.NumFilesStored),
224+
TotalStoredBytes: atomic.LoadInt64(&d.stats.TotalStoredBytes),
225+
NumFilesGCed: atomic.LoadInt64(&d.stats.NumFilesGCed),
226+
TotalGCedSizeBytes: atomic.LoadInt64(&d.stats.TotalGCedSizeBytes),
227+
NumCacheHits: atomic.LoadInt64(&d.stats.NumCacheHits),
228+
NumCacheMisses: atomic.LoadInt64(&d.stats.NumCacheMisses),
229+
TotalCacheHitSizeBytes: atomic.LoadInt64(&d.stats.TotalCacheHitSizeBytes),
230+
InitTime: time.Duration(atomic.LoadInt64((*int64)(&d.stats.InitTime))),
231+
TotalGCTime: time.Duration(atomic.LoadInt64((*int64)(&d.stats.TotalGCTime))),
202232
}
203-
return s[:len(s)-len(suffix)], true
204233
}
205234

206-
func (d *DiskCache) getKeyFromFileName(fname string) (key, error) {
235+
func getKeyFromFileName(fname string) (key, error) {
207236
pair := strings.Split(fname, ".")
208237
if len(pair) != 2 {
209-
return key{}, fmt.Errorf("expected file name in the form [ac_]hash/size, got %s", fname)
238+
return key{}, fmt.Errorf("expected file name in the form hash[_ac].size, got %s", fname)
210239
}
211240
size, err := strconv.ParseInt(pair[1], 10, 64)
212241
if err != nil {
213242
return key{}, fmt.Errorf("invalid size in digest %s: %s", fname, err)
214243
}
215-
hash, isAc := CutSuffix(pair[0], "ac_")
244+
hash, isAc := strings.CutSuffix(pair[0], "_ac")
216245
dg, err := digest.New(hash, size)
217246
if err != nil {
218-
return key{}, fmt.Errorf("invalid digest from file name %s: %v", fname, err)
247+
return key{}, fmt.Errorf("invalid digest from file name %s: %w", fname, err)
219248
}
220249
return key{digest: dg, isCas: !isAc}, nil
221250
}
@@ -248,7 +277,10 @@ func (d *DiskCache) StoreCas(dg digest.Digest, path string) error {
248277
if err := copyFile(path, d.getPath(it.key), dg.Size); err != nil {
249278
return err
250279
}
251-
newSize := uint64(atomic.AddInt64(&d.sizeBytes, dg.Size))
280+
newSize := uint64(atomic.AddInt64(&d.stats.TotalSizeBytes, dg.Size))
281+
atomic.AddInt64(&d.stats.TotalNumFiles, 1)
282+
atomic.AddInt64(&d.stats.NumFilesStored, 1)
283+
atomic.AddInt64(&d.stats.TotalStoredBytes, dg.Size)
252284
if newSize > d.maxCapacityBytes {
253285
select {
254286
case d.gcReq <- atomic.AddUint64(&d.gcTick, 1):
@@ -281,7 +313,10 @@ func (d *DiskCache) StoreActionCache(dg digest.Digest, ar *repb.ActionResult) er
281313
if err := os.WriteFile(d.getPath(it.key), bytes, 0644); err != nil {
282314
return err
283315
}
284-
newSize := uint64(atomic.AddInt64(&d.sizeBytes, int64(size)))
316+
newSize := uint64(atomic.AddInt64(&d.stats.TotalSizeBytes, int64(size)))
317+
atomic.AddInt64(&d.stats.TotalNumFiles, 1)
318+
atomic.AddInt64(&d.stats.NumFilesStored, 1)
319+
atomic.AddInt64(&d.stats.TotalStoredBytes, int64(size))
285320
if newSize > d.maxCapacityBytes {
286321
select {
287322
case d.gcReq <- atomic.AddUint64(&d.gcTick, 1):
@@ -292,38 +327,40 @@ func (d *DiskCache) StoreActionCache(dg digest.Digest, ar *repb.ActionResult) er
292327
}
293328

294329
func (d *DiskCache) gc() {
330+
defer func() { d.gcDone <- true }()
295331
for {
296332
select {
297333
case <-d.shutdown:
298334
return
299335
case <-d.ctx.Done():
300336
return
301-
case t := <-d.gcReq:
337+
case <-d.gcReq:
338+
start := time.Now()
302339
// Evict old entries until total size is below cap.
303-
for uint64(atomic.LoadInt64(&d.sizeBytes)) > d.maxCapacityBytes {
340+
for uint64(atomic.LoadInt64(&d.stats.TotalSizeBytes)) > d.maxCapacityBytes {
304341
d.mu.Lock()
305342
it := heap.Pop(d.queue).(*qitem)
306343
d.mu.Unlock()
307344
size, err := d.getItemSize(it.key)
308345
if err != nil {
309-
log.Errorf("error getting item size for %v: %v", it.key, err)
346+
log.Errorf("error getting item size for %v: %w", it.key, err)
310347
size = 0
311348
}
312-
atomic.AddInt64(&d.sizeBytes, -size)
349+
atomic.AddInt64(&d.stats.TotalSizeBytes, -size)
350+
atomic.AddInt64(&d.stats.TotalNumFiles, -1)
351+
atomic.AddInt64(&d.stats.TotalGCedSizeBytes, size)
352+
atomic.AddInt64(&d.stats.NumFilesGCed, 1)
313353
it.mu.Lock()
354+
diskOpsStart := time.Now()
314355
// We only delete the files, and not the prefix directories, because the prefixes are not worth worrying about.
315356
if err := os.Remove(d.getPath(it.key)); err != nil {
316-
log.Errorf("Error removing file: %v", err)
357+
log.Errorf("Error removing file: %w", err)
317358
}
359+
atomic.AddInt64((*int64)(&d.stats.TotalGCDiskOpsTime), int64(time.Since(diskOpsStart)))
318360
d.store.Delete(it.key)
319361
it.mu.Unlock()
320362
}
321-
if d.testGcTicks != nil {
322-
select {
323-
case d.testGcTicks <- t:
324-
default:
325-
}
326-
}
363+
atomic.AddInt64((*int64)(&d.stats.TotalGCTime), int64(time.Since(start)))
327364
}
328365
}
329366
}
@@ -363,6 +400,7 @@ func (d *DiskCache) LoadCas(dg digest.Digest, path string) bool {
363400
k := key{digest: dg, isCas: true}
364401
iUntyped, loaded := d.store.Load(k)
365402
if !loaded {
403+
atomic.AddInt64(&d.stats.NumCacheMisses, 1)
366404
return false
367405
}
368406
it := iUntyped.(*qitem)
@@ -371,58 +409,59 @@ func (d *DiskCache) LoadCas(dg digest.Digest, path string) bool {
371409
it.mu.RUnlock()
372410
if err != nil {
373411
// It is not possible to prevent a race with GC; hence, we return false on copy errors.
412+
atomic.AddInt64(&d.stats.NumCacheMisses, 1)
374413
return false
375414
}
376415

377416
d.mu.Lock()
378417
d.queue.Bump(it)
379418
d.mu.Unlock()
419+
atomic.AddInt64(&d.stats.NumCacheHits, 1)
420+
atomic.AddInt64(&d.stats.TotalCacheHitSizeBytes, dg.Size)
380421
return true
381422
}
382423

383424
func (d *DiskCache) LoadActionCache(dg digest.Digest) (ar *repb.ActionResult, loaded bool) {
384425
k := key{digest: dg, isCas: false}
385426
iUntyped, loaded := d.store.Load(k)
386427
if !loaded {
428+
atomic.AddInt64(&d.stats.NumCacheMisses, 1)
387429
return nil, false
388430
}
389431
it := iUntyped.(*qitem)
390432
it.mu.RLock()
391433
ar = &repb.ActionResult{}
392-
if err := d.loadActionResult(k, ar); err != nil {
434+
size, err := d.loadActionResult(k, ar)
435+
if err != nil {
393436
// It is not possible to prevent a race with GC; hence, we return false on load errors.
394437
it.mu.RUnlock()
438+
atomic.AddInt64(&d.stats.NumCacheMisses, 1)
395439
return nil, false
396440
}
397441
it.mu.RUnlock()
398442

399443
d.mu.Lock()
400444
d.queue.Bump(it)
401445
d.mu.Unlock()
446+
atomic.AddInt64(&d.stats.NumCacheHits, 1)
447+
atomic.AddInt64(&d.stats.TotalCacheHitSizeBytes, int64(size))
402448
return ar, true
403449
}
404450

405-
func (d *DiskCache) loadActionResult(k key, ar *repb.ActionResult) error {
451+
func (d *DiskCache) loadActionResult(k key, ar *repb.ActionResult) (int, error) {
406452
bytes, err := os.ReadFile(d.getPath(k))
407453
if err != nil {
408-
return err
454+
return 0, err
409455
}
456+
n := len(bytes)
410457
// Required sanity check: sometimes the read pretends to succeed, but doesn't, if
411458
// the file is being concurrently deleted. Empty ActionResult is advised against in
412459
// the RE-API: https://github.com/bazelbuild/remote-apis/blob/main/build/bazel/remote/execution/v2/remote_execution.proto#L1052
413-
if len(bytes) == 0 {
414-
return fmt.Errorf("read empty ActionResult for %v", k.digest)
460+
if n == 0 {
461+
return n, fmt.Errorf("read empty ActionResult for %v", k.digest)
415462
}
416463
if err := proto.Unmarshal(bytes, ar); err != nil {
417-
return fmt.Errorf("error unmarshalling %v as ActionResult: %v", bytes, err)
418-
}
419-
return nil
420-
}
421-
422-
func getLastAccessTime(path string) (time.Time, error) {
423-
info, err := os.Stat(path)
424-
if err != nil {
425-
return time.Time{}, err
464+
return n, fmt.Errorf("error unmarshalling %v as ActionResult: %w", bytes, err)
426465
}
427-
return FileInfoToAccessTime(info), nil
466+
return n, nil
428467
}

0 commit comments

Comments
 (0)