Skip to content

Commit 7118cc2

Browse files
committed
fix prometheus histogram count and sum to be cumulative
resetting timers and resetting samples now track cumulative count and sum across snapshots so that prometheus _count and _sum metrics monotonically increase as required by the prometheus spec. the per-window values and percentiles still reset normally.
1 parent d8cb8a9 commit 7118cc2

File tree

5 files changed

+184
-3
lines changed

5 files changed

+184
-3
lines changed

metrics/prometheus/collector.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -122,12 +122,12 @@ func (c *collector) addTimer(name string, m *metrics.TimerSnapshot) {
122122
}
123123

124124
func (c *collector) addResettingTimer(name string, m *metrics.ResettingTimerSnapshot) {
125-
if m.Count() <= 0 {
125+
if m.TotalCount() <= 0 {
126126
return
127127
}
128128
pv := []float64{0.5, 0.75, 0.95, 0.99, 0.999, 0.9999}
129129
ps := m.Percentiles(pv)
130-
c.writeSummaryCounter(name, m.Count())
130+
c.writeSummaryCounter(name, m.TotalCount())
131131
c.buff.WriteString(fmt.Sprintf(typeSummaryTpl, mutateKey(name)))
132132
for i := range pv {
133133
c.writeSummaryPercentile(name, strconv.FormatFloat(pv[i], 'f', -1, 64), ps[i])

metrics/prometheus/collector_test.go

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ import (
2121
"os"
2222
"strings"
2323
"testing"
24+
"time"
2425

2526
"github.com/ethereum/go-ethereum/metrics"
2627
"github.com/ethereum/go-ethereum/metrics/internal"
@@ -51,6 +52,49 @@ func TestCollector(t *testing.T) {
5152
}
5253
}
5354

55+
func TestResettingTimerCumulativePrometheus(t *testing.T) {
56+
registry := metrics.NewRegistry()
57+
timer := metrics.NewRegisteredResettingTimer("test/resetting", registry)
58+
59+
// First batch of updates.
60+
timer.Update(10 * time.Millisecond)
61+
timer.Update(20 * time.Millisecond)
62+
63+
// First scrape.
64+
c1 := newCollector()
65+
registry.Each(func(name string, i interface{}) {
66+
c1.Add(name, i)
67+
})
68+
out1 := c1.buff.String()
69+
if !strings.Contains(out1, "test_resetting_count 2") {
70+
t.Fatalf("first scrape should have count 2, got:\n%s", out1)
71+
}
72+
73+
// Second batch.
74+
timer.Update(30 * time.Millisecond)
75+
76+
// Second scrape - count should be cumulative (3, not 1).
77+
c2 := newCollector()
78+
registry.Each(func(name string, i interface{}) {
79+
c2.Add(name, i)
80+
})
81+
out2 := c2.buff.String()
82+
if !strings.Contains(out2, "test_resetting_count 3") {
83+
t.Fatalf("second scrape should have cumulative count 3, got:\n%s", out2)
84+
}
85+
86+
// Third scrape with no new updates - count should stay at 3.
87+
c3 := newCollector()
88+
registry.Each(func(name string, i interface{}) {
89+
c3.Add(name, i)
90+
})
91+
out3 := c3.buff.String()
92+
// With no new events and totalCount > 0, we still need to report.
93+
if !strings.Contains(out3, "test_resetting_count 3") {
94+
t.Fatalf("third scrape should still report cumulative count 3, got:\n%s", out3)
95+
}
96+
}
97+
5498
func findFirstDiffPos(a, b string) string {
5599
yy := strings.Split(b, "\n")
56100
for i, x := range strings.Split(a, "\n") {

metrics/resetting_sample.go

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
package metrics
22

3+
import "sync"
4+
35
// ResettingSample converts an ordinary sample into one that resets whenever its
46
// snapshot is retrieved. This will break for multi-monitor systems, but when only
57
// a single metric is being pushed out, this ensure that low-frequency events don't
@@ -14,11 +16,29 @@ func ResettingSample(sample Sample) Sample {
1416
// snapshot retrieval.
1517
type resettingSample struct {
1618
Sample
19+
mutex sync.Mutex
20+
totalCount int64 // cumulative count across all snapshots
21+
totalSum int64 // cumulative sum across all snapshots
1722
}
1823

1924
// Snapshot returns a read-only copy of the sample with the original reset.
25+
// The returned snapshot has cumulative count and sum values that monotonically
26+
// increase across resets, as required by the Prometheus counter convention.
2027
func (rs *resettingSample) Snapshot() *sampleSnapshot {
28+
rs.mutex.Lock()
29+
defer rs.mutex.Unlock()
30+
2131
s := rs.Sample.Snapshot()
2232
rs.Sample.Clear()
33+
34+
// Accumulate cumulative totals from this snapshot's window.
35+
rs.totalCount += s.count
36+
rs.totalSum += s.sum
37+
38+
// Override count and sum with cumulative values so that Prometheus
39+
// _count and _sum are monotonically increasing counters.
40+
s.count = rs.totalCount
41+
s.sum = rs.totalSum
42+
2343
return s
2444
}

metrics/resetting_timer.go

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,14 +36,25 @@ type ResettingTimer struct {
3636
values []int64
3737
sum int64 // sum is a running count of the total sum, used later to calculate mean
3838

39+
totalCount int64 // cumulative count across all snapshots
40+
totalSum int64 // cumulative sum across all snapshots
41+
3942
mutex sync.Mutex
4043
}
4144

4245
// Snapshot resets the timer and returns a read-only copy of its contents.
4346
func (t *ResettingTimer) Snapshot() *ResettingTimerSnapshot {
4447
t.mutex.Lock()
4548
defer t.mutex.Unlock()
46-
snapshot := &ResettingTimerSnapshot{}
49+
50+
// Accumulate cumulative totals before resetting.
51+
t.totalCount += int64(len(t.values))
52+
t.totalSum += t.sum
53+
54+
snapshot := &ResettingTimerSnapshot{
55+
totalCount: t.totalCount,
56+
totalSum: t.totalSum,
57+
}
4758
if len(t.values) > 0 {
4859
snapshot.mean = float64(t.sum) / float64(len(t.values))
4960
snapshot.values = t.values
@@ -84,13 +95,25 @@ type ResettingTimerSnapshot struct {
8495
min int64
8596
thresholdBoundaries []float64
8697
calculated bool
98+
totalCount int64 // cumulative count across all snapshots
99+
totalSum int64 // cumulative sum across all snapshots
87100
}
88101

89102
// Count return the length of the values from snapshot.
90103
func (t *ResettingTimerSnapshot) Count() int {
91104
return len(t.values)
92105
}
93106

107+
// TotalCount returns the cumulative count of events across all snapshots.
108+
func (t *ResettingTimerSnapshot) TotalCount() int64 {
109+
return t.totalCount
110+
}
111+
112+
// TotalSum returns the cumulative sum of event durations across all snapshots.
113+
func (t *ResettingTimerSnapshot) TotalSum() int64 {
114+
return t.totalSum
115+
}
116+
94117
// Percentiles returns the boundaries for the input percentiles.
95118
// note: this method is not thread safe
96119
func (t *ResettingTimerSnapshot) Percentiles(percentiles []float64) []float64 {

metrics/resetting_timer_test.go

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,100 @@ import (
55
"time"
66
)
77

8+
func TestResettingTimerCumulativeCountAndSum(t *testing.T) {
9+
timer := NewResettingTimer()
10+
11+
// First batch of updates.
12+
timer.Update(10 * time.Millisecond)
13+
timer.Update(20 * time.Millisecond)
14+
timer.Update(30 * time.Millisecond)
15+
16+
snap1 := timer.Snapshot()
17+
if have, want := snap1.Count(), 3; have != want {
18+
t.Fatalf("snap1 count: have %d, want %d", have, want)
19+
}
20+
if have, want := snap1.TotalCount(), int64(3); have != want {
21+
t.Fatalf("snap1 total count: have %d, want %d", have, want)
22+
}
23+
wantSum1 := int64(10*time.Millisecond + 20*time.Millisecond + 30*time.Millisecond)
24+
if have := snap1.TotalSum(); have != wantSum1 {
25+
t.Fatalf("snap1 total sum: have %d, want %d", have, wantSum1)
26+
}
27+
28+
// Second batch of updates (after snapshot reset the values).
29+
timer.Update(40 * time.Millisecond)
30+
timer.Update(50 * time.Millisecond)
31+
32+
snap2 := timer.Snapshot()
33+
// Per-window count should be 2.
34+
if have, want := snap2.Count(), 2; have != want {
35+
t.Fatalf("snap2 count: have %d, want %d", have, want)
36+
}
37+
// Cumulative count should be 5 (3 + 2).
38+
if have, want := snap2.TotalCount(), int64(5); have != want {
39+
t.Fatalf("snap2 total count: have %d, want %d", have, want)
40+
}
41+
// Cumulative sum should include both batches.
42+
wantSum2 := wantSum1 + int64(40*time.Millisecond+50*time.Millisecond)
43+
if have := snap2.TotalSum(); have != wantSum2 {
44+
t.Fatalf("snap2 total sum: have %d, want %d", have, wantSum2)
45+
}
46+
47+
// Empty snapshot should still report the same cumulative totals.
48+
snap3 := timer.Snapshot()
49+
if have, want := snap3.Count(), 0; have != want {
50+
t.Fatalf("snap3 count: have %d, want %d", have, want)
51+
}
52+
if have, want := snap3.TotalCount(), int64(5); have != want {
53+
t.Fatalf("snap3 total count: have %d, want %d", have, want)
54+
}
55+
if have := snap3.TotalSum(); have != wantSum2 {
56+
t.Fatalf("snap3 total sum: have %d, want %d", have, wantSum2)
57+
}
58+
}
59+
60+
func TestResettingSampleCumulativeCountAndSum(t *testing.T) {
61+
Enable()
62+
63+
s := ResettingSample(NewUniformSample(100))
64+
65+
// First batch.
66+
s.Update(10)
67+
s.Update(20)
68+
s.Update(30)
69+
70+
snap1 := s.Snapshot()
71+
if have, want := snap1.Count(), int64(3); have != want {
72+
t.Fatalf("snap1 count: have %d, want %d", have, want)
73+
}
74+
if have, want := snap1.Sum(), int64(60); have != want {
75+
t.Fatalf("snap1 sum: have %d, want %d", have, want)
76+
}
77+
78+
// Second batch.
79+
s.Update(40)
80+
s.Update(50)
81+
82+
snap2 := s.Snapshot()
83+
// Count should be cumulative: 3 + 2 = 5.
84+
if have, want := snap2.Count(), int64(5); have != want {
85+
t.Fatalf("snap2 count: have %d, want %d", have, want)
86+
}
87+
// Sum should be cumulative: 60 + 90 = 150.
88+
if have, want := snap2.Sum(), int64(150); have != want {
89+
t.Fatalf("snap2 sum: have %d, want %d", have, want)
90+
}
91+
92+
// Empty snapshot should still report cumulative totals.
93+
snap3 := s.Snapshot()
94+
if have, want := snap3.Count(), int64(5); have != want {
95+
t.Fatalf("snap3 count: have %d, want %d", have, want)
96+
}
97+
if have, want := snap3.Sum(), int64(150); have != want {
98+
t.Fatalf("snap3 sum: have %d, want %d", have, want)
99+
}
100+
}
101+
8102
func TestResettingTimer(t *testing.T) {
9103
tests := []struct {
10104
values []int64

0 commit comments

Comments
 (0)