@@ -56,8 +56,13 @@ class BaseCache:
56
56
57
57
def __init__(self, blocksize: int, fetcher: Fetcher, size: int) -> None:
    """Record the cache configuration and start all statistics at zero.

    Parameters
    ----------
    blocksize : int
        Stored as ``self.blocksize``.
    fetcher : Fetcher
        Stored as ``self.fetcher``; ``_fetch`` calls it as
        ``fetcher(start, stop)``.
    size : int
        Stored as ``self.size`` (shown as "file size" by ``__repr__``).
    """
    # configuration supplied by the caller
    self.blocksize, self.fetcher, self.size = blocksize, fetcher, size

    # bookkeeping; nothing has been cached or requested yet
    self.nblocks = 0
    self.hit_count = 0
    self.miss_count = 0
    # the bytes that we actually requested
    self.total_requested_bytes = 0
61
66
62
67
def _fetch (self , start : int | None , stop : int | None ) -> bytes :
63
68
if start is None :
@@ -68,6 +73,36 @@ def _fetch(self, start: int | None, stop: int | None) -> bytes:
68
73
return b""
69
74
return self .fetcher (start , stop )
70
75
76
def _reset_stats(self) -> None:
    """Zero the hit, miss and requested-byte counters.

    Allows a more granular report, e.g. one per file.
    """
    self.hit_count = self.miss_count = self.total_requested_bytes = 0
81
+
82
def _log_stats(self) -> str:
    """Format the cache statistics as a fragment for log messages.

    Returns
    -------
    str
        ``""`` when both the hit and the miss counter are zero (the cache
        has done nothing); otherwise the cache name followed by the hit,
        miss and requested-byte totals.
    """
    if self.hit_count != 0 or self.miss_count != 0:
        stats = (
            self.name,
            self.hit_count,
            self.miss_count,
            self.total_requested_bytes,
        )
        return " , %s: %d hits, %d misses, %d total requested bytes" % stats
    # a cache that does nothing, this is for logs only
    return ""
93
+
94
+ def __repr__ (self ) -> str :
95
+ # Multi-line, human-readable summary of the cache: class name, block size,
+ # block count, file size, hit/miss counters and total requested bytes.
+ # TODO: use rich for better formatting
96
+ return f"""
97
+ <{ self .__class__ .__name__ } :
98
+ block size : { self .blocksize }
99
+ block count : { self .nblocks }
100
+ file size : { self .size }
101
+ cache hits : { self .hit_count }
102
+ cache misses: { self .miss_count }
103
+ total requested bytes: { self .total_requested_bytes } >
104
+ """
105
+
71
106
72
107
class MMapCache (BaseCache ):
73
108
"""memory-mapped sparse file cache
@@ -126,13 +161,18 @@ def _fetch(self, start: int | None, end: int | None) -> bytes:
126
161
start_block = start // self .blocksize
127
162
end_block = end // self .blocksize
128
163
need = [i for i in range (start_block , end_block + 1 ) if i not in self .blocks ]
164
+ hits = [i for i in range (start_block , end_block + 1 ) if i in self .blocks ]
165
+ self .miss_count += len (need )
166
+ self .hit_count += len (hits )
129
167
while need :
130
168
# TODO: not a for loop so we can consolidate blocks later to
131
169
# make fewer fetch calls; this could be parallel
132
170
i = need .pop (0 )
171
+
133
172
sstart = i * self .blocksize
134
173
send = min (sstart + self .blocksize , self .size )
135
- logger .debug (f"MMap get block #{ i } ({ sstart } -{ send } " )
174
+ self .total_requested_bytes += send - sstart
175
+ logger .debug (f"MMap get block #{ i } ({ sstart } -{ send } )" )
136
176
self .cache [sstart :send ] = self .fetcher (sstart , send )
137
177
self .blocks .add (i )
138
178
@@ -176,16 +216,20 @@ def _fetch(self, start: int | None, end: int | None) -> bytes:
176
216
l = end - start
177
217
if start >= self .start and end <= self .end :
178
218
# cache hit
219
+ self .hit_count += 1
179
220
return self .cache [start - self .start : end - self .start ]
180
221
elif self .start <= start < self .end :
181
222
# partial hit
223
+ self .miss_count += 1
182
224
part = self .cache [start - self .start :]
183
225
l -= len (part )
184
226
start = self .end
185
227
else :
186
228
# miss
229
+ self .miss_count += 1
187
230
part = b""
188
231
end = min (self .size , end + self .blocksize )
232
+ self .total_requested_bytes += end - start
189
233
self .cache = self .fetcher (start , end ) # new block replaces old
190
234
self .start = start
191
235
self .end = self .start + len (self .cache )
@@ -202,24 +246,39 @@ class FirstChunkCache(BaseCache):
202
246
name = "first"
203
247
204
248
def __init__(self, blocksize: int, fetcher: Fetcher, size: int) -> None:
    """Initialise the cache, capping the block size at the file size.

    A ``blocksize`` larger than ``size`` is replaced by ``size`` before it
    is passed to the base initialiser, so the first chunk buffers the whole
    thing. The cached chunk itself starts out as ``None``.
    """
    # this will buffer the whole thing when the block exceeds the file
    capped_blocksize = size if blocksize > size else blocksize
    super().__init__(capped_blocksize, fetcher, size)
    self.cache: bytes | None = None
207
254
208
255
def _fetch(self, start: int | None, end: int | None) -> bytes:
    """Return bytes ``start:end``, keeping the first block in memory.

    Parameters
    ----------
    start : int | None
        First byte wanted; ``None`` (or 0) means the beginning.
    end : int | None
        One past the last byte wanted; ``None`` means "up to ``self.size``".
        Values beyond ``self.size`` are clamped to it.

    Returns
    -------
    bytes
        The requested range, or ``b""`` when ``start`` lies past
        ``self.size``.

    Notes
    -----
    Requests that begin inside the first block are answered from
    ``self.cache`` (filled on first use), plus one extra fetch for any part
    beyond the block. Requests that begin after the first block go straight
    to the fetcher. ``hit_count``, ``miss_count`` and
    ``total_requested_bytes`` are updated along the way.
    """
    start = start or 0
    if start > self.size:
        logger.debug("FirstChunkCache: requested start > file size")
        return b""

    # ``end`` is declared as ``int | None``; ``min(None, size)`` would raise
    # TypeError, so treat None as "read to the end of the file".
    end = self.size if end is None else min(end, self.size)

    if start < self.blocksize:
        if self.cache is None:
            self.miss_count += 1
            if end > self.blocksize:
                # one request covers both the first block and the tail
                self.total_requested_bytes += end
                data = self.fetcher(0, end)
                self.cache = data[: self.blocksize]
                return data[start:]
            self.cache = self.fetcher(0, self.blocksize)
            self.total_requested_bytes += self.blocksize
        part = self.cache[start:end]
        if end > self.blocksize:
            # the tail beyond the first block is never cached
            self.total_requested_bytes += end - self.blocksize
            part += self.fetcher(self.blocksize, end)
        self.hit_count += 1
        return part
    else:
        # entirely outside the first block: plain pass-through
        self.miss_count += 1
        self.total_requested_bytes += end - start
        return self.fetcher(start, end)
224
283
225
284
@@ -256,12 +315,6 @@ def __init__(
256
315
self .maxblocks = maxblocks
257
316
self ._fetch_block_cached = functools .lru_cache (maxblocks )(self ._fetch_block )
258
317
259
- def __repr__ (self ) -> str :
260
- return (
261
- f"<BlockCache blocksize={ self .blocksize } , "
262
- f"size={ self .size } , nblocks={ self .nblocks } >"
263
- )
264
-
265
318
def cache_info (self ):
266
319
"""
267
320
The statistics on the block cache.
@@ -319,6 +372,8 @@ def _fetch_block(self, block_number: int) -> bytes:
319
372
320
373
start = block_number * self .blocksize
321
374
end = start + self .blocksize
375
+ self .total_requested_bytes += end - start
376
+ self .miss_count += 1
322
377
logger .info ("BlockCache fetching block %d" , block_number )
323
378
block_contents = super ()._fetch (start , end )
324
379
return block_contents
@@ -339,6 +394,7 @@ def _read_cache(
339
394
start_pos = start % self .blocksize
340
395
end_pos = end % self .blocksize
341
396
397
+ self .hit_count += 1
342
398
if start_block_number == end_block_number :
343
399
block : bytes = self ._fetch_block_cached (start_block_number )
344
400
return block [start_pos :end_pos ]
@@ -404,6 +460,7 @@ def _fetch(self, start: int | None, end: int | None) -> bytes:
404
460
):
405
461
# cache hit: we have all the required data
406
462
offset = start - self .start
463
+ self .hit_count += 1
407
464
return self .cache [offset : offset + end - start ]
408
465
409
466
if self .blocksize :
@@ -418,27 +475,34 @@ def _fetch(self, start: int | None, end: int | None) -> bytes:
418
475
self .end is None or end > self .end
419
476
):
420
477
# First read, or extending both before and after
478
+ self .total_requested_bytes += bend - start
479
+ self .miss_count += 1
421
480
self .cache = self .fetcher (start , bend )
422
481
self .start = start
423
482
else :
424
483
assert self .start is not None
425
484
assert self .end is not None
485
+ self .miss_count += 1
426
486
427
487
if start < self .start :
428
488
if self .end is None or self .end - end > self .blocksize :
489
+ self .total_requested_bytes += bend - start
429
490
self .cache = self .fetcher (start , bend )
430
491
self .start = start
431
492
else :
493
+ self .total_requested_bytes += self .start - start
432
494
new = self .fetcher (start , self .start )
433
495
self .start = start
434
496
self .cache = new + self .cache
435
497
elif self .end is not None and bend > self .end :
436
498
if self .end > self .size :
437
499
pass
438
500
elif end - self .end > self .blocksize :
501
+ self .total_requested_bytes += bend - start
439
502
self .cache = self .fetcher (start , bend )
440
503
self .start = start
441
504
else :
505
+ self .total_requested_bytes += bend - self .end
442
506
new = self .fetcher (self .end , bend )
443
507
self .cache = self .cache + new
444
508
@@ -470,10 +534,13 @@ def __init__(
470
534
) -> None :
471
535
super ().__init__ (blocksize , fetcher , size ) # type: ignore[arg-type]
472
536
if data is None :
537
+ self .miss_count += 1
538
+ self .total_requested_bytes += self .size
473
539
data = self .fetcher (0 , self .size )
474
540
self .data = data
475
541
476
542
def _fetch(self, start: int | None, stop: int | None) -> bytes:
    """Slice the requested range out of the in-memory ``self.data``.

    Every call increments ``hit_count``. ``None`` bounds behave as in
    ordinary slicing.
    """
    self.hit_count += 1
    requested = slice(start, stop)
    return self.data[requested]
478
545
479
546
@@ -551,6 +618,7 @@ def _fetch(self, start: int | None, stop: int | None) -> bytes:
551
618
# are allowed to pad reads beyond the
552
619
# buffer with zero
553
620
out += b"\x00 " * (stop - start - len (out ))
621
+ self .hit_count += 1
554
622
return out
555
623
else :
556
624
# The request ends outside a known range,
@@ -572,6 +640,8 @@ def _fetch(self, start: int | None, stop: int | None) -> bytes:
572
640
f"IO/caching performance may be poor!"
573
641
)
574
642
logger .debug (f"KnownPartsOfAFile cache fetching { start } -{ stop } " )
643
+ self .total_requested_bytes += stop - start
644
+ self .miss_count += 1
575
645
return out + super ()._fetch (start , stop )
576
646
577
647
@@ -676,12 +746,6 @@ def __init__(
676
746
self ._fetch_future : Future [bytes ] | None = None
677
747
self ._fetch_future_lock = threading .Lock ()
678
748
679
- def __repr__ (self ) -> str :
680
- return (
681
- f"<BackgroundBlockCache blocksize={ self .blocksize } , "
682
- f"size={ self .size } , nblocks={ self .nblocks } >"
683
- )
684
-
685
749
def cache_info (self ) -> UpdatableLRU .CacheInfo :
686
750
"""
687
751
The statistics on the block cache.
@@ -799,6 +863,8 @@ def _fetch_block(self, block_number: int, log_info: str = "sync") -> bytes:
799
863
start = block_number * self .blocksize
800
864
end = start + self .blocksize
801
865
logger .info ("BlockCache fetching block (%s) %d" , log_info , block_number )
866
+ self .total_requested_bytes += end - start
867
+ self .miss_count += 1
802
868
block_contents = super ()._fetch (start , end )
803
869
return block_contents
804
870
@@ -818,6 +884,9 @@ def _read_cache(
818
884
start_pos = start % self .blocksize
819
885
end_pos = end % self .blocksize
820
886
887
+ # kind of pointless to count this as a hit, but it is
888
+ self .hit_count += 1
889
+
821
890
if start_block_number == end_block_number :
822
891
block = self ._fetch_block_cached (start_block_number )
823
892
return block [start_pos :end_pos ]
0 commit comments