Skip to content

Commit 05e7d80

Browse files
authored
Tracking cache requests (#1566)
1 parent 2dd9355 commit 05e7d80

File tree

3 files changed

+195
-28
lines changed

3 files changed

+195
-28
lines changed

fsspec/caching.py

+83-14
Original file line number | Diff line number | Diff line change
@@ -56,8 +56,13 @@ class BaseCache:
5656

5757
def __init__(self, blocksize: int, fetcher: Fetcher, size: int) -> None:
5858
self.blocksize = blocksize
59+
self.nblocks = 0
5960
self.fetcher = fetcher
6061
self.size = size
62+
self.hit_count = 0
63+
self.miss_count = 0
64+
# the bytes that we actually requested
65+
self.total_requested_bytes = 0
6166

6267
def _fetch(self, start: int | None, stop: int | None) -> bytes:
6368
if start is None:
@@ -68,6 +73,36 @@ def _fetch(self, start: int | None, stop: int | None) -> bytes:
6873
return b""
6974
return self.fetcher(start, stop)
7075

76+
def _reset_stats(self) -> None:
77+
"""Reset hit and miss counts for a more granular report, e.g. by file."""
78+
self.hit_count = 0
79+
self.miss_count = 0
80+
self.total_requested_bytes = 0
81+
82+
def _log_stats(self) -> str:
83+
"""Return a formatted string of the cache statistics."""
84+
if self.hit_count == 0 and self.miss_count == 0:
85+
# no hits or misses recorded yet, so there is nothing to report; this string is only used in log output
86+
return ""
87+
return " , %s: %d hits, %d misses, %d total requested bytes" % (
88+
self.name,
89+
self.hit_count,
90+
self.miss_count,
91+
self.total_requested_bytes,
92+
)
93+
94+
def __repr__(self) -> str:
95+
# TODO: use rich for better formatting
96+
return f"""
97+
<{self.__class__.__name__}:
98+
block size : {self.blocksize}
99+
block count : {self.nblocks}
100+
file size : {self.size}
101+
cache hits : {self.hit_count}
102+
cache misses: {self.miss_count}
103+
total requested bytes: {self.total_requested_bytes}>
104+
"""
105+
71106

72107
class MMapCache(BaseCache):
73108
"""memory-mapped sparse file cache
@@ -126,13 +161,18 @@ def _fetch(self, start: int | None, end: int | None) -> bytes:
126161
start_block = start // self.blocksize
127162
end_block = end // self.blocksize
128163
need = [i for i in range(start_block, end_block + 1) if i not in self.blocks]
164+
hits = [i for i in range(start_block, end_block + 1) if i in self.blocks]
165+
self.miss_count += len(need)
166+
self.hit_count += len(hits)
129167
while need:
130168
# TODO: not a for loop so we can consolidate blocks later to
131169
# make fewer fetch calls; this could be parallel
132170
i = need.pop(0)
171+
133172
sstart = i * self.blocksize
134173
send = min(sstart + self.blocksize, self.size)
135-
logger.debug(f"MMap get block #{i} ({sstart}-{send}")
174+
self.total_requested_bytes += send - sstart
175+
logger.debug(f"MMap get block #{i} ({sstart}-{send})")
136176
self.cache[sstart:send] = self.fetcher(sstart, send)
137177
self.blocks.add(i)
138178

@@ -176,16 +216,20 @@ def _fetch(self, start: int | None, end: int | None) -> bytes:
176216
l = end - start
177217
if start >= self.start and end <= self.end:
178218
# cache hit
219+
self.hit_count += 1
179220
return self.cache[start - self.start : end - self.start]
180221
elif self.start <= start < self.end:
181222
# partial hit
223+
self.miss_count += 1
182224
part = self.cache[start - self.start :]
183225
l -= len(part)
184226
start = self.end
185227
else:
186228
# miss
229+
self.miss_count += 1
187230
part = b""
188231
end = min(self.size, end + self.blocksize)
232+
self.total_requested_bytes += end - start
189233
self.cache = self.fetcher(start, end) # new block replaces old
190234
self.start = start
191235
self.end = self.start + len(self.cache)
@@ -202,24 +246,39 @@ class FirstChunkCache(BaseCache):
202246
name = "first"
203247

204248
def __init__(self, blocksize: int, fetcher: Fetcher, size: int) -> None:
249+
if blocksize > size:
250+
# this will buffer the whole thing
251+
blocksize = size
205252
super().__init__(blocksize, fetcher, size)
206253
self.cache: bytes | None = None
207254

208255
def _fetch(self, start: int | None, end: int | None) -> bytes:
209256
start = start or 0
210-
end = end or self.size
257+
if start > self.size:
258+
logger.debug("FirstChunkCache: requested start > file size")
259+
return b""
260+
261+
end = min(end, self.size)
262+
211263
if start < self.blocksize:
212264
if self.cache is None:
265+
self.miss_count += 1
213266
if end > self.blocksize:
267+
self.total_requested_bytes += end
214268
data = self.fetcher(0, end)
215269
self.cache = data[: self.blocksize]
216270
return data[start:]
217271
self.cache = self.fetcher(0, self.blocksize)
272+
self.total_requested_bytes += self.blocksize
218273
part = self.cache[start:end]
219274
if end > self.blocksize:
275+
self.total_requested_bytes += end - self.blocksize
220276
part += self.fetcher(self.blocksize, end)
277+
self.hit_count += 1
221278
return part
222279
else:
280+
self.miss_count += 1
281+
self.total_requested_bytes += end - start
223282
return self.fetcher(start, end)
224283

225284

@@ -256,12 +315,6 @@ def __init__(
256315
self.maxblocks = maxblocks
257316
self._fetch_block_cached = functools.lru_cache(maxblocks)(self._fetch_block)
258317

259-
def __repr__(self) -> str:
260-
return (
261-
f"<BlockCache blocksize={self.blocksize}, "
262-
f"size={self.size}, nblocks={self.nblocks}>"
263-
)
264-
265318
def cache_info(self):
266319
"""
267320
The statistics on the block cache.
@@ -319,6 +372,8 @@ def _fetch_block(self, block_number: int) -> bytes:
319372

320373
start = block_number * self.blocksize
321374
end = start + self.blocksize
375+
self.total_requested_bytes += end - start
376+
self.miss_count += 1
322377
logger.info("BlockCache fetching block %d", block_number)
323378
block_contents = super()._fetch(start, end)
324379
return block_contents
@@ -339,6 +394,7 @@ def _read_cache(
339394
start_pos = start % self.blocksize
340395
end_pos = end % self.blocksize
341396

397+
self.hit_count += 1
342398
if start_block_number == end_block_number:
343399
block: bytes = self._fetch_block_cached(start_block_number)
344400
return block[start_pos:end_pos]
@@ -404,6 +460,7 @@ def _fetch(self, start: int | None, end: int | None) -> bytes:
404460
):
405461
# cache hit: we have all the required data
406462
offset = start - self.start
463+
self.hit_count += 1
407464
return self.cache[offset : offset + end - start]
408465

409466
if self.blocksize:
@@ -418,27 +475,34 @@ def _fetch(self, start: int | None, end: int | None) -> bytes:
418475
self.end is None or end > self.end
419476
):
420477
# First read, or extending both before and after
478+
self.total_requested_bytes += bend - start
479+
self.miss_count += 1
421480
self.cache = self.fetcher(start, bend)
422481
self.start = start
423482
else:
424483
assert self.start is not None
425484
assert self.end is not None
485+
self.miss_count += 1
426486

427487
if start < self.start:
428488
if self.end is None or self.end - end > self.blocksize:
489+
self.total_requested_bytes += bend - start
429490
self.cache = self.fetcher(start, bend)
430491
self.start = start
431492
else:
493+
self.total_requested_bytes += self.start - start
432494
new = self.fetcher(start, self.start)
433495
self.start = start
434496
self.cache = new + self.cache
435497
elif self.end is not None and bend > self.end:
436498
if self.end > self.size:
437499
pass
438500
elif end - self.end > self.blocksize:
501+
self.total_requested_bytes += bend - start
439502
self.cache = self.fetcher(start, bend)
440503
self.start = start
441504
else:
505+
self.total_requested_bytes += bend - self.end
442506
new = self.fetcher(self.end, bend)
443507
self.cache = self.cache + new
444508

@@ -470,10 +534,13 @@ def __init__(
470534
) -> None:
471535
super().__init__(blocksize, fetcher, size) # type: ignore[arg-type]
472536
if data is None:
537+
self.miss_count += 1
538+
self.total_requested_bytes += self.size
473539
data = self.fetcher(0, self.size)
474540
self.data = data
475541

476542
def _fetch(self, start: int | None, stop: int | None) -> bytes:
543+
self.hit_count += 1
477544
return self.data[start:stop]
478545

479546

@@ -551,6 +618,7 @@ def _fetch(self, start: int | None, stop: int | None) -> bytes:
551618
# are allowed to pad reads beyond the
552619
# buffer with zero
553620
out += b"\x00" * (stop - start - len(out))
621+
self.hit_count += 1
554622
return out
555623
else:
556624
# The request ends outside a known range,
@@ -572,6 +640,8 @@ def _fetch(self, start: int | None, stop: int | None) -> bytes:
572640
f"IO/caching performance may be poor!"
573641
)
574642
logger.debug(f"KnownPartsOfAFile cache fetching {start}-{stop}")
643+
self.total_requested_bytes += stop - start
644+
self.miss_count += 1
575645
return out + super()._fetch(start, stop)
576646

577647

@@ -676,12 +746,6 @@ def __init__(
676746
self._fetch_future: Future[bytes] | None = None
677747
self._fetch_future_lock = threading.Lock()
678748

679-
def __repr__(self) -> str:
680-
return (
681-
f"<BackgroundBlockCache blocksize={self.blocksize}, "
682-
f"size={self.size}, nblocks={self.nblocks}>"
683-
)
684-
685749
def cache_info(self) -> UpdatableLRU.CacheInfo:
686750
"""
687751
The statistics on the block cache.
@@ -799,6 +863,8 @@ def _fetch_block(self, block_number: int, log_info: str = "sync") -> bytes:
799863
start = block_number * self.blocksize
800864
end = start + self.blocksize
801865
logger.info("BlockCache fetching block (%s) %d", log_info, block_number)
866+
self.total_requested_bytes += end - start
867+
self.miss_count += 1
802868
block_contents = super()._fetch(start, end)
803869
return block_contents
804870

@@ -818,6 +884,9 @@ def _read_cache(
818884
start_pos = start % self.blocksize
819885
end_pos = end % self.blocksize
820886

887+
# counting every read as a hit is of limited value, but it is recorded anyway
888+
self.hit_count += 1
889+
821890
if start_block_number == end_block_number:
822891
block = self._fetch_block_cached(start_block_number)
823892
return block[start_pos:end_pos]

fsspec/spec.py

+8-1
Original file line number | Diff line number | Diff line change
@@ -1841,11 +1841,18 @@ def read(self, length=-1):
18411841
length = self.size - self.loc
18421842
if self.closed:
18431843
raise ValueError("I/O operation on closed file.")
1844-
logger.debug("%s read: %i - %i", self, self.loc, self.loc + length)
18451844
if length == 0:
18461845
# don't even bother calling fetch
18471846
return b""
18481847
out = self.cache._fetch(self.loc, self.loc + length)
1848+
1849+
logger.debug(
1850+
"%s read: %i - %i %s",
1851+
self,
1852+
self.loc,
1853+
self.loc + length,
1854+
self.cache._log_stats(),
1855+
)
18491856
self.loc += len(out)
18501857
return out
18511858

0 commit comments

Comments
 (0)