Skip to content

Commit c2df331

Browse files
Fix Prometheus gauge collision for cache_size metric (#9729)
## What Changed The replication progress cache was reusing `MutableStateCacheTypeTagValue` as its `cache_type` metrics tag, causing its `cache_size` gauge to overwrite the mutable state cache's gauge via Prometheus last-write-wins semantics. This made `cache_size{cache_type="mutablestate"}` always report `128000` (the replication progress cache's default size) instead of the actual mutable state cache capacity — particularly misleading when `cacheSizeBasedLimit: true` is configured. ## Fix - Add a new `ReplicationProgressCacheTypeTagValue = "replication_progress"` constant in `common/metrics/metric_defs.go` - Use it in `service/history/replication/progress_cache.go` instead of `MutableStateCacheTypeTagValue` - Add a unit test verifying the progress cache uses a distinct `cache_type` tag value Both caches now report their `cache_size` independently via distinct `cache_type` label values. ## Why This is a one-line behavioral change (plus the new constant and test). It only affects metric label values — no logic, API surface, or persistence changes. Fixes #9600 Co-authored-by: Jiechen Zhong <jiechen.zhong@temporal.io>
1 parent 7e96477 commit c2df331

3 files changed

Lines changed: 40 additions & 1 deletion

File tree

common/metrics/metric_defs.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ const (
5151
ReactivationSignalDedupCacheTypeTagValue = "reactivation_signal_dedup"
5252
RoutingInfoCacheTypeTagValue = "routing_info"
5353
NexusEndpointRegistryReadThroughCacheTypeTagValue = "nexus_endpoint_registry_readthrough"
54+
ReplicationProgressCacheTypeTagValue = "replication_progress"
5455

5556
InvalidHistoryURITagValue = "invalid_history_uri"
5657
InvalidVisibilityURITagValue = "invalid_visibility_uri"

service/history/replication/progress_cache.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ func NewProgressCache(
5858
TTL: config.ReplicationProgressCacheTTL(),
5959
}
6060
return &progressCacheImpl{
61-
cache: cache.NewWithMetrics(maxSize, opts, handler.WithTags(metrics.CacheTypeTag(metrics.MutableStateCacheTypeTagValue))),
61+
cache: cache.NewWithMetrics(maxSize, opts, handler.WithTags(metrics.CacheTypeTag(metrics.ReplicationProgressCacheTypeTagValue))),
6262
}
6363
}
6464

service/history/replication/progress_cache_test.go

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ import (
1010
historyspb "go.temporal.io/server/api/history/v1"
1111
persistencespb "go.temporal.io/server/api/persistence/v1"
1212
"go.temporal.io/server/common/metrics"
13+
"go.temporal.io/server/common/metrics/metricstest"
1314
"go.temporal.io/server/common/persistence/versionhistory"
1415
"go.temporal.io/server/common/testing/protorequire"
1516
"go.temporal.io/server/service/history/shard"
@@ -161,3 +162,40 @@ func (s *progressCacheSuite) TestProgressCache() {
161162
cachedProgress = s.progressCache.Get(s.runID, targetClusterID)
162163
s.DeepEqual(expected3, cachedProgress)
163164
}
165+
166+
func TestProgressCacheUsesDistinctMetricsTag(t *testing.T) {
167+
// Verify that the progress cache uses its own cache_type tag value
168+
// ("replication_progress") rather than the mutable state cache's tag value
169+
// ("mutablestate"), preventing Prometheus gauge collision. See #9600.
170+
ctrl := gomock.NewController(t)
171+
mockShard := shard.NewTestContext(
172+
ctrl,
173+
&persistencespb.ShardInfo{
174+
ShardId: 0,
175+
RangeId: 1,
176+
},
177+
tests.NewDynamicConfig(),
178+
)
179+
defer ctrl.Finish()
180+
defer mockShard.StopForTest()
181+
182+
metricsHandler := metricstest.NewCaptureHandler()
183+
capture := metricsHandler.StartCapture()
184+
185+
NewProgressCache(mockShard.GetConfig(), mockShard.GetLogger(), metricsHandler)
186+
187+
snapshot := capture.Snapshot()
188+
cacheSizeRecordings := snapshot[metrics.CacheSize.Name()]
189+
require.NotEmpty(t, cacheSizeRecordings, "expected cache_size metric to be recorded")
190+
191+
// Verify the cache_type tag is "replication_progress", not "mutablestate"
192+
found := false
193+
for _, recording := range cacheSizeRecordings {
194+
if tag, ok := recording.Tags[metrics.CacheTypeTagName]; ok {
195+
require.Equal(t, metrics.ReplicationProgressCacheTypeTagValue, tag,
196+
"progress cache should use ReplicationProgressCacheTypeTagValue, not MutableStateCacheTypeTagValue")
197+
found = true
198+
}
199+
}
200+
require.True(t, found, "expected cache_size metric with cache_type tag")
201+
}

0 commit comments

Comments
 (0)