Skip to content

Commit a3d487c

Browse files
authored
Refactor cache metrics for consistency and completeness (#1721)
This PR streamlines cache metrics collection across disk and express cache implementations. This change renames caching metrics for consistency with other OTLP metrics. This change also captures latency, bytes_transferred from/to cache and errors consistently across both disk and express cache implementations. ### Does this change impact existing behavior? Yes, updates cache metrics in logs ### Does this change need a changelog entry? Does it require a version change? Yes --- By submitting this pull request, I confirm that my contribution is made under the terms of the Apache 2.0 license and I agree to the terms of the [Developer Certificate of Origin (DCO)](https://developercertificate.org/). --------- Signed-off-by: Sahitya Damera <sahityad@amazon.com>
1 parent 63d79f6 commit a3d487c

7 files changed

Lines changed: 107 additions & 89 deletions

File tree

mountpoint-s3-fs/CHANGELOG.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
## Unreleased (v0.8.4)
22

33
* Add metric to track cache hit rate in logs. ([#1716](https://github.com/awslabs/mountpoint-s3/pull/1716))
4-
* Remove redundant cache merics in logs. ([#1716](https://github.com/awslabs/mountpoint-s3/pull/1716))
4+
* Remove redundant cache merics in logs. ([#1716](https://github.com/awslabs/mountpoint-s3/pull/1716), [#1721](https://github.com/awslabs/mountpoint-s3/pull/1721))
5+
* Rename cache metrics for consistency. ([#1721](https://github.com/awslabs/mountpoint-s3/pull/1721))
56

67
## v0.8.3 (October 30, 2025)
78

mountpoint-s3-fs/src/data_cache.rs

Lines changed: 0 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -42,19 +42,6 @@ pub enum DataCacheError {
4242
EvictionFailure,
4343
}
4444

45-
impl DataCacheError {
46-
fn reason(&self) -> &'static str {
47-
match self {
48-
DataCacheError::IoFailure(_) => "io_failure",
49-
DataCacheError::InvalidBlockHeader(_) => "invalid_block_header",
50-
DataCacheError::InvalidBlockChecksum => "invalid_block_checksum",
51-
DataCacheError::InvalidBlockContent => "invalid_block_content",
52-
DataCacheError::InvalidBlockOffset => "invalid_block_offset",
53-
DataCacheError::EvictionFailure => "eviction_failure",
54-
}
55-
}
56-
}
57-
5845
pub type DataCacheResult<Value> = Result<Value, DataCacheError>;
5946

6047
/// Data cache for fixed-size checksummed buffers.

mountpoint-s3-fs/src/data_cache/disk_data_cache.rs

Lines changed: 47 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,10 @@ use tracing::{trace, warn};
2121
use crate::checksums::IntegrityError;
2222
use crate::data_cache::DataCacheError;
2323
use crate::memory::{BufferKind, PagedPool};
24+
use crate::metrics::defs::{
25+
ATTR_CACHE, CACHE_DISK, CACHE_EVICT_LATENCY, CACHE_GET_ERRORS, CACHE_GET_IO_SIZE, CACHE_GET_LATENCY,
26+
CACHE_PUT_ERRORS, CACHE_PUT_IO_SIZE, CACHE_PUT_LATENCY, CACHE_TOTAL_SIZE,
27+
};
2428
use crate::object::ObjectId;
2529
use crate::sync::Mutex;
2630

@@ -387,7 +391,7 @@ impl DiskDataCache {
387391
}
388392

389393
fn is_limit_exceeded(&self, size: usize) -> bool {
390-
metrics::gauge!("disk_data_cache.disk_usage_mib").set((size / 1024 / 1024) as f64);
394+
metrics::gauge!(CACHE_TOTAL_SIZE, ATTR_CACHE => CACHE_DISK).set(size as f64);
391395
match self.config.limit {
392396
CacheLimit::Unbounded => false,
393397
CacheLimit::TotalSize { max_size } => size > max_size,
@@ -461,29 +465,27 @@ impl DataCache for DiskDataCache {
461465
let start = Instant::now();
462466
let block_key = DiskBlockKey::new(cache_key, block_idx);
463467
let path = self.get_path_for_block_key(&block_key);
464-
match self.read_block(&path, cache_key, block_idx, block_offset) {
468+
let result = match self.read_block(&path, cache_key, block_idx, block_offset) {
465469
Ok(None) => {
466470
// Cache miss.
467-
metrics::counter!("disk_data_cache.block_hit").increment(0);
468471
Ok(None)
469472
}
470473
Ok(Some(bytes)) => {
471474
// Cache hit.
472-
metrics::counter!("disk_data_cache.block_hit").increment(1);
473-
metrics::counter!("disk_data_cache.total_bytes", "type" => "read").increment(bytes.len() as u64);
474-
metrics::histogram!("disk_data_cache.read_duration_us").record(start.elapsed().as_micros() as f64);
475+
metrics::counter!(CACHE_GET_IO_SIZE, ATTR_CACHE => CACHE_DISK).increment(bytes.len() as u64);
475476
if let Some(usage) = &self.usage {
476477
usage.lock().unwrap().refresh(&block_key);
477478
}
478479
Ok(Some(bytes))
479480
}
480481
Err(err) => {
481-
// Invalid block. Count as cache miss.
482-
metrics::counter!("disk_data_cache.block_hit").increment(0);
483-
metrics::counter!("disk_data_cache.block_err").increment(1);
482+
// Invalid block.
483+
metrics::counter!(CACHE_GET_ERRORS, ATTR_CACHE => CACHE_DISK).increment(1);
484484
Err(err)
485485
}
486-
}
486+
};
487+
metrics::histogram!(CACHE_GET_LATENCY, ATTR_CACHE => CACHE_DISK).record(start.elapsed().as_micros() as f64);
488+
result
487489
}
488490

489491
async fn put_block(
@@ -497,37 +499,48 @@ impl DataCache for DiskDataCache {
497499
if block_offset != block_idx * self.config.block_size {
498500
return Err(DataCacheError::InvalidBlockOffset);
499501
}
500-
502+
let start = Instant::now();
501503
let bytes_len = bytes.len();
502504
let block_key = DiskBlockKey::new(&cache_key, block_idx);
503505
let path = self.get_path_for_block_key(&block_key);
504506
trace!(?cache_key, ?path, "new block will be created in disk cache");
505507

506-
let block = DiskBlock::new(cache_key, block_idx, block_offset, bytes).map_err(|err| match err {
507-
DiskBlockCreationError::IntegrityError(_e) => DataCacheError::InvalidBlockContent,
508-
})?;
509-
510-
{
511-
let eviction_start = Instant::now();
512-
let result = self.evict_if_needed();
513-
metrics::histogram!("disk_data_cache.eviction_duration_us")
514-
.record(eviction_start.elapsed().as_micros() as f64);
515-
result
516-
}?;
517-
518-
let write_start = Instant::now();
519-
let (temp_file, size) = self.write_block(&path, block)?;
520-
metrics::histogram!("disk_data_cache.write_duration_us").record(write_start.elapsed().as_micros() as f64);
521-
metrics::counter!("disk_data_cache.total_bytes", "type" => "write").increment(bytes_len as u64);
522-
if let Some(usage) = &self.usage {
523-
let mut usage = usage.lock().unwrap();
524-
_ = temp_file.persist(path).map_err(|e| e.error)?;
525-
usage.add(block_key, size);
508+
// Capture the put operation result separately from metrics recording
509+
// to ensure we can record both success and error metrics consistently
510+
let put_result = (|| -> DataCacheResult<()> {
511+
let block = DiskBlock::new(cache_key, block_idx, block_offset, bytes).map_err(|err| match err {
512+
DiskBlockCreationError::IntegrityError(_e) => DataCacheError::InvalidBlockContent,
513+
})?;
514+
515+
{
516+
let eviction_start = Instant::now();
517+
let result = self.evict_if_needed();
518+
metrics::histogram!(CACHE_EVICT_LATENCY, ATTR_CACHE => CACHE_DISK)
519+
.record(eviction_start.elapsed().as_micros() as f64);
520+
result
521+
}?;
522+
523+
let result = self.write_block(&path, block);
524+
let (temp_file, size) = result?;
525+
526+
if let Some(usage) = &self.usage {
527+
let mut usage = usage.lock().unwrap();
528+
_ = temp_file.persist(path).map_err(|e| e.error)?;
529+
usage.add(block_key, size);
530+
} else {
531+
_ = temp_file.persist(path).map_err(|e| e.error)?;
532+
}
533+
534+
Ok(())
535+
})();
536+
537+
if put_result.is_ok() {
538+
metrics::counter!(CACHE_PUT_IO_SIZE, ATTR_CACHE => CACHE_DISK).increment(bytes_len as u64);
526539
} else {
527-
_ = temp_file.persist(path).map_err(|e| e.error)?;
540+
metrics::counter!(CACHE_PUT_ERRORS, ATTR_CACHE => CACHE_DISK).increment(1);
528541
}
529-
530-
Ok(())
542+
metrics::histogram!(CACHE_PUT_LATENCY, ATTR_CACHE => CACHE_DISK).record(start.elapsed().as_micros() as f64);
543+
put_result
531544
}
532545

533546
fn block_size(&self) -> u64 {

mountpoint-s3-fs/src/data_cache/express_data_cache.rs

Lines changed: 20 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,9 @@
11
use super::{BlockIndex, ChecksummedBytes, DataCache, DataCacheError, DataCacheResult};
22
use crate::ServerSideEncryption;
3+
use crate::metrics::defs::{
4+
ATTR_CACHE, CACHE_EXPRESS, CACHE_GET_ERRORS, CACHE_GET_IO_SIZE, CACHE_GET_LATENCY, CACHE_OVERSIZED_OBJECTS,
5+
CACHE_PUT_ERRORS, CACHE_PUT_IO_SIZE, CACHE_PUT_LATENCY,
6+
};
37
use crate::object::ObjectId;
48
use std::collections::HashMap;
59
use std::time::Instant;
@@ -167,7 +171,7 @@ where
167171
object_size: usize,
168172
) -> DataCacheResult<Option<ChecksummedBytes>> {
169173
if object_size > self.config.max_object_size {
170-
metrics::counter!("express_data_cache.over_max_object_size", "type" => "read").increment(1);
174+
metrics::counter!(CACHE_OVERSIZED_OBJECTS, ATTR_CACHE => CACHE_EXPRESS).increment(1);
171175
return Ok(None);
172176
}
173177

@@ -259,7 +263,7 @@ where
259263
object_size: usize,
260264
) -> DataCacheResult<()> {
261265
if object_size > self.config.max_object_size {
262-
metrics::counter!("express_data_cache.over_max_object_size", "type" => "write").increment(1);
266+
metrics::counter!(CACHE_OVERSIZED_OBJECTS, ATTR_CACHE => CACHE_EXPRESS).increment(1);
263267
return Ok(());
264268
}
265269

@@ -296,25 +300,18 @@ where
296300
object_size: usize,
297301
) -> DataCacheResult<Option<ChecksummedBytes>> {
298302
let start = Instant::now();
299-
let (result, result_type) = match self.read_block(cache_key, block_idx, block_offset, object_size).await {
303+
let result = match self.read_block(cache_key, block_idx, block_offset, object_size).await {
300304
Ok(Some(data)) => {
301-
metrics::counter!("express_data_cache.block_hit").increment(1);
302-
metrics::counter!("express_data_cache.total_bytes", "type" => "read").increment(data.len() as u64);
303-
(Ok(Some(data)), "ok")
304-
}
305-
Ok(None) => {
306-
metrics::counter!("express_data_cache.block_hit").increment(0);
307-
(Ok(None), "miss")
305+
metrics::counter!(CACHE_GET_IO_SIZE, ATTR_CACHE => CACHE_EXPRESS).increment(data.len() as u64);
306+
Ok(Some(data))
308307
}
308+
Ok(None) => Ok(None),
309309
Err(err) => {
310-
metrics::counter!("express_data_cache.block_hit").increment(0);
311-
metrics::counter!("express_data_cache.block_err", "reason" => err.reason(), "type" => "read")
312-
.increment(1);
313-
(Err(err), "error")
310+
metrics::counter!(CACHE_GET_ERRORS, ATTR_CACHE => CACHE_EXPRESS).increment(1);
311+
Err(err)
314312
}
315313
};
316-
metrics::histogram!("express_data_cache.read_duration_us", "type" => result_type)
317-
.record(start.elapsed().as_micros() as f64);
314+
metrics::histogram!(CACHE_GET_LATENCY, ATTR_CACHE => CACHE_EXPRESS).record(start.elapsed().as_micros() as f64);
318315
result
319316
}
320317

@@ -327,22 +324,21 @@ where
327324
object_size: usize,
328325
) -> DataCacheResult<()> {
329326
let start = Instant::now();
330-
let (result, result_type) = match self
327+
let bytes_len = bytes.len();
328+
let result = match self
331329
.write_block(cache_key, block_idx, block_offset, bytes, object_size)
332330
.await
333331
{
334332
Ok(()) => {
335-
metrics::counter!("express_data_cache.total_bytes", "type" => "write").increment(object_size as u64);
336-
(Ok(()), "ok")
333+
metrics::counter!(CACHE_PUT_IO_SIZE, ATTR_CACHE => CACHE_EXPRESS).increment(bytes_len as u64);
334+
Ok(())
337335
}
338336
Err(err) => {
339-
metrics::counter!("express_data_cache.block_err", "reason" => err.reason(), "type" => "write")
340-
.increment(1);
341-
(Err(err), "error")
337+
metrics::counter!(CACHE_PUT_ERRORS, ATTR_CACHE => CACHE_EXPRESS).increment(1);
338+
Err(err)
342339
}
343340
};
344-
metrics::histogram!("express_data_cache.write_duration_us", "type" => result_type)
345-
.record(start.elapsed().as_micros() as f64);
341+
metrics::histogram!(CACHE_PUT_LATENCY, ATTR_CACHE => CACHE_EXPRESS).record(start.elapsed().as_micros() as f64);
346342
result
347343
}
348344

mountpoint-s3-fs/src/metrics/defs.rs

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,8 +29,25 @@ pub const PROCESS_MEMORY_USAGE: &str = "process.memory_usage";
2929

3030
pub const PREFETCH_RESET_STATE: &str = "prefetch.reset_state";
3131

32+
// Cache metric constants
33+
pub const CACHE_GET_IO_SIZE: &str = "cache.get_io_size";
34+
pub const CACHE_PUT_IO_SIZE: &str = "cache.put_io_size";
35+
pub const CACHE_GET_LATENCY: &str = "cache.get_latency";
36+
pub const CACHE_PUT_LATENCY: &str = "cache.put_latency";
37+
pub const CACHE_GET_ERRORS: &str = "cache.get_errors";
38+
pub const CACHE_PUT_ERRORS: &str = "cache.put_errors";
39+
pub const CACHE_TOTAL_SIZE: &str = "cache.total_size";
40+
pub const CACHE_EVICT_LATENCY: &str = "cache.evict_latency";
41+
pub const CACHE_BLOCK_HIT: &str = "cache.block_hit";
42+
pub const CACHE_OVERSIZED_OBJECTS: &str = "cache.oversized_objects";
43+
3244
// Attribute constants
3345
pub const ATTR_FUSE_REQUEST: &str = "fuse_request";
46+
pub const ATTR_CACHE: &str = "cache";
47+
48+
// Cache type constants
49+
pub const CACHE_DISK: &str = "disk";
50+
pub const CACHE_EXPRESS: &str = "express";
3451

3552
pub fn lookup_config(name: &str) -> MetricConfig {
3653
match name {

mountpoint-s3-fs/tests/fuse_tests/metrics_test.rs

Lines changed: 19 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,11 @@ use mountpoint_s3_fs::S3FilesystemConfig;
77
use mountpoint_s3_fs::data_cache::{
88
DiskDataCache, DiskDataCacheConfig, ExpressDataCache, ExpressDataCacheConfig, MultilevelDataCache,
99
};
10+
use mountpoint_s3_fs::metrics::defs::{
11+
ATTR_CACHE, CACHE_DISK, CACHE_EVICT_LATENCY, CACHE_EXPRESS, CACHE_GET_IO_SIZE, CACHE_GET_LATENCY,
12+
CACHE_PUT_IO_SIZE, CACHE_PUT_LATENCY,
13+
};
14+
1015
use mountpoint_s3_fs::metrics::defs::{
1116
ATTR_FUSE_REQUEST, FUSE_IO_SIZE, FUSE_REQUEST_ERRORS, FUSE_REQUEST_LATENCY, PREFETCH_RESET_STATE,
1217
};
@@ -239,23 +244,22 @@ rusty_fork_test! {
239244
test_session.client().put_object("file.txt", &content).unwrap();
240245
let path = test_session.mount_path().join("file.txt");
241246

242-
// First read - populate cache and wait for asynchronous cache updates
247+
// First read - populate cache
243248
File::open(&path).unwrap().read_to_end(&mut Vec::new()).unwrap();
244249
sleep(Duration::from_millis(100));
245250

246251
// No cache hits but cache should be updated
247252
assert!(recorder.get("fuse.cache_hit", &[]).is_none(), "Cache hit metric should not exist after cache miss");
248-
assert_metric_exists(&recorder, "disk_data_cache.write_duration_us", &[]);
249-
assert_metric_exists(&recorder, "disk_data_cache.total_bytes", &[("type", "write")]);
253+
assert_metric_exists(&recorder, CACHE_PUT_LATENCY, &[(ATTR_CACHE, CACHE_DISK)]);
254+
assert_metric_exists(&recorder, CACHE_PUT_IO_SIZE, &[(ATTR_CACHE, CACHE_DISK)]);
250255

251256
// Second read - cache hit
252257
File::open(&path).unwrap().read_to_end(&mut Vec::new()).unwrap();
253258

254259
// Verify cache hit metrics
255260
assert_metric_exists(&recorder, "fuse.cache_hit", &[]);
256-
assert_metric_exists(&recorder, "disk_data_cache.block_hit", &[]);
257-
assert_metric_exists(&recorder, "disk_data_cache.read_duration_us", &[]);
258-
assert_metric_exists(&recorder, "disk_data_cache.total_bytes", &[("type", "read")]);
261+
assert_metric_exists(&recorder, CACHE_GET_LATENCY, &[(ATTR_CACHE, CACHE_DISK)]);
262+
assert_metric_exists(&recorder, CACHE_GET_IO_SIZE, &[(ATTR_CACHE, CACHE_DISK)]);
259263

260264
// Read a large file to trigger eviction metrics
261265
let large_content = vec![b'z'; 2 * 1024 * 1024];
@@ -265,7 +269,7 @@ rusty_fork_test! {
265269
sleep(Duration::from_millis(100));
266270

267271
// Verify eviction metrics
268-
assert_metric_exists(&recorder, "disk_data_cache.eviction_duration_us", &[]);
272+
assert_metric_exists(&recorder, CACHE_EVICT_LATENCY, &[(ATTR_CACHE, CACHE_DISK)]);
269273
}
270274

271275
#[test]
@@ -304,22 +308,21 @@ rusty_fork_test! {
304308
sleep(Duration::from_millis(100));
305309

306310
// Verify cache population metrics
307-
// Currently, disk_data_cache.read_duration_us is only recorded on cache hits, not misses.
308311
// But they are recorded for express.
309312
assert!(recorder.get("fuse.cache_hit", &[]).is_none(), "Cache hit metric should not exist after cache miss");
310-
assert_metric_exists(&recorder, "disk_data_cache.write_duration_us", &[]);
311-
assert_metric_exists(&recorder, "disk_data_cache.total_bytes", &[("type", "write")]);
312-
assert_metric_exists(&recorder, "express_data_cache.write_duration_us", &[("type", "ok")]);
313-
assert_metric_exists(&recorder, "express_data_cache.total_bytes", &[("type", "write")]);
314-
assert_metric_exists(&recorder, "express_data_cache.read_duration_us", &[("type", "miss")]);
313+
assert_metric_exists(&recorder, CACHE_PUT_LATENCY, &[(ATTR_CACHE, CACHE_DISK)]);
314+
assert_metric_exists(&recorder, CACHE_PUT_IO_SIZE, &[(ATTR_CACHE, CACHE_DISK)]);
315+
assert_metric_exists(&recorder, CACHE_GET_LATENCY, &[(ATTR_CACHE, CACHE_DISK)]);
316+
assert_metric_exists(&recorder, CACHE_PUT_LATENCY, &[(ATTR_CACHE, CACHE_EXPRESS)]);
317+
assert_metric_exists(&recorder, CACHE_PUT_IO_SIZE, &[(ATTR_CACHE, CACHE_EXPRESS)]);
318+
assert_metric_exists(&recorder, CACHE_GET_LATENCY, &[(ATTR_CACHE, CACHE_EXPRESS)]);
315319

316320
// Second read should hit disk cache after async write completes
317321
File::open(&path).unwrap().read_to_end(&mut Vec::new()).unwrap();
318322
sleep(Duration::from_millis(100));
319323

320324
assert_metric_exists(&recorder, "fuse.cache_hit", &[]);
321-
assert_metric_exists(&recorder, "disk_data_cache.block_hit", &[]);
322-
assert_metric_exists(&recorder, "disk_data_cache.read_duration_us", &[]);
323-
assert_metric_exists(&recorder, "disk_data_cache.total_bytes", &[("type", "read")]);
325+
assert_metric_exists(&recorder, CACHE_GET_LATENCY, &[(ATTR_CACHE, CACHE_DISK)]);
326+
assert_metric_exists(&recorder, CACHE_GET_IO_SIZE, &[(ATTR_CACHE, CACHE_DISK)]);
324327
}
325328
}

mountpoint-s3/CHANGELOG.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
## Unreleased (v1.22.0)
22

33
* Add metric to track cache hit rate in logs. ([#1716](https://github.com/awslabs/mountpoint-s3/pull/1716))
4-
* Remove redundant cache merics in logs. ([#1716](https://github.com/awslabs/mountpoint-s3/pull/1716))
4+
* Remove redundant cache merics in logs. ([#1716](https://github.com/awslabs/mountpoint-s3/pull/1716), [#1721](https://github.com/awslabs/mountpoint-s3/pull/1721))
5+
* Rename cache metrics for consistency. ([#1721](https://github.com/awslabs/mountpoint-s3/pull/1721))
56

67
## v1.21.0 (Oct 30, 2025)
78

0 commit comments

Comments
 (0)