Skip to content

Commit e4c5009

Browse files
committed
[KLC-2405] address PR #60 review: atomic snapshot + Unix guard + comment fix
- Add ClockSnapshot() (time.Duration, time.Time) to ntp.SyncTimer. Returns offset + lastSync under a single RLock so the polling code can publish them as a coherent pair. The previous two-call read had a microseconds-wide window where a sync could land between the two getter calls and mix offset/timestamp across two sync passes -- vanishingly improbable but cleaner to remove entirely. - Guard uint64(ts.Unix()) against negative seconds. A pre-1970 RTC after a successful sync would otherwise wrap to ~1.8e19 and silently defeat any time()-value>threshold stale-sync alarm. Now publishes 0. - Rewrite the misleading "float64 round-trip" comment on SetInt64Value. The float64 conversion exists in cmd/connector/provider, not in the /node/metrics writer (statusHandler/statusMetricsProvider.go), which formats stored int64/uint64 directly with %v. Real reason for SetInt64Value is that uint64 cannot represent negative offsets. Tests: - ntp/syncTime_test.go: TestClockSnapshotReturnsConsistentPair asserts the snapshot matches individual getters post-sync. - cmd/node/metrics/clockMetrics_test.go: TestBuildClockMetricsPollingFunc_UsesAtomicSnapshot fails the test if the polling code falls back to the legacy getters. TestBuildClockMetricsPollingFunc_NegativeUnixGuard verifies a 1969 timestamp publishes 0, not the uint64 wrap-around. Addresses review threads from @copilot and @coderabbitai on PR #60.
1 parent 0162782 commit e4c5009

6 files changed

Lines changed: 125 additions & 7 deletions

File tree

cmd/node/metrics/clockMetrics.go

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -53,18 +53,26 @@ func StartClockMetricsPolling(
5353

5454
func buildClockMetricsPollingFunc(syncTimer ntp.SyncTimer) func(core.AppStatusHandler) {
5555
return func(appStatusHandler core.AppStatusHandler) {
56+
// Read offset and last-sync timestamp under a single RLock so a scrape
57+
// can't observe a fresh offset paired with a stale timestamp (or vice
58+
// versa) if a sync lands between two separate getter calls.
59+
offset, lastSync := syncTimer.ClockSnapshot()
60+
5661
// ClockOffset is signed: negative means OS clock is fast relative to NTP
57-
// servers, positive means it is slow. Route through SetInt64Value so the
58-
// sign survives the float64 round-trip done by the metrics provider.
59-
appStatusHandler.SetInt64Value(core.MetricClockOffsetNs, syncTimer.ClockOffset().Nanoseconds())
62+
// servers, positive means it is slow. SetInt64Value (not SetUInt64Value)
63+
// is required because uint64 can't represent negative values.
64+
appStatusHandler.SetInt64Value(core.MetricClockOffsetNs, offset.Nanoseconds())
6065

6166
// Zero before the first successful sync — publish 0 explicitly so
6267
// `time() - value > threshold` alarms only fire once a sync has
63-
// actually happened (avoids the "node booted but NTP is broken" alarm
64-
// firing identically to "node has been up for years without resync").
68+
// actually happened. The unix > 0 guard defends against a pathological
69+
// pre-1970 RTC: uint64(negative-int64) wraps to a huge value that
70+
// would silently keep "stale-sync" alarms quiet forever.
6571
var lastSyncUnix uint64
66-
if ts := syncTimer.LastSyncTimestamp(); !ts.IsZero() {
67-
lastSyncUnix = uint64(ts.Unix()) // #nosec G115 -- unix seconds fit in uint64 well beyond year 2262
72+
if !lastSync.IsZero() {
73+
if unix := lastSync.Unix(); unix > 0 {
74+
lastSyncUnix = uint64(unix)
75+
}
6876
}
6977
appStatusHandler.SetUInt64Value(core.MetricClockLastSyncTimestamp, lastSyncUnix)
7078
}

cmd/node/metrics/clockMetrics_test.go

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -167,6 +167,59 @@ func TestBuildClockMetricsPollingFunc_LastSyncTimestampPublished(t *testing.T) {
167167
assert.Equal(t, uint64(syncTs.Unix()), cap.u64[core.MetricClockLastSyncTimestamp])
168168
}
169169

170+
func TestBuildClockMetricsPollingFunc_NegativeUnixGuard(t *testing.T) {
171+
// Pathological case: a non-zero timestamp whose Unix() is negative (pre-1970
172+
// RTC after a successful sync). Without the guard, uint64(negative-int64)
173+
// wraps to ~1.8e19, which would silently bypass any "stale sync" alarm
174+
// expressed as `time() - value > threshold`. Must publish 0 instead.
175+
preEpoch := time.Date(1969, 6, 1, 0, 0, 0, 0, time.UTC)
176+
require.True(t, preEpoch.Unix() < 0, "test setup: timestamp must precede 1970")
177+
178+
cap, stub := newCaptureHandler()
179+
syncTimer := &consensusMock.SyncTimerMock{
180+
LastSyncTimestampCalled: func() time.Time { return preEpoch },
181+
}
182+
183+
pollingFunc := buildClockMetricsPollingFunc(syncTimer)
184+
pollingFunc(stub)
185+
186+
cap.mu.Lock()
187+
defer cap.mu.Unlock()
188+
assert.Equal(t, uint64(0), cap.u64[core.MetricClockLastSyncTimestamp],
189+
"negative Unix seconds must publish 0, not the uint64 wrap-around")
190+
}
191+
192+
func TestBuildClockMetricsPollingFunc_UsesAtomicSnapshot(t *testing.T) {
193+
// Verify the polling func reads (offset, lastSync) via the single-RLock
194+
// snapshot, not as two independent getter calls. Forcing the legacy getters
195+
// to fail the assertion proves the snapshot path is what publishes the metrics.
196+
const offset = 1234 * time.Nanosecond
197+
syncTs := time.Unix(1_700_000_000, 0)
198+
199+
cap, stub := newCaptureHandler()
200+
syncTimer := &consensusMock.SyncTimerMock{
201+
ClockOffsetCalled: func() time.Duration {
202+
t.Errorf("ClockOffset() must not be called — polling must use ClockSnapshot()")
203+
return 0
204+
},
205+
LastSyncTimestampCalled: func() time.Time {
206+
t.Errorf("LastSyncTimestamp() must not be called — polling must use ClockSnapshot()")
207+
return time.Time{}
208+
},
209+
ClockSnapshotCalled: func() (time.Duration, time.Time) {
210+
return offset, syncTs
211+
},
212+
}
213+
214+
pollingFunc := buildClockMetricsPollingFunc(syncTimer)
215+
pollingFunc(stub)
216+
217+
cap.mu.Lock()
218+
defer cap.mu.Unlock()
219+
assert.Equal(t, offset.Nanoseconds(), cap.i64[core.MetricClockOffsetNs])
220+
assert.Equal(t, uint64(syncTs.Unix()), cap.u64[core.MetricClockLastSyncTimestamp])
221+
}
222+
170223
// TestClockMetricsAppearInPrometheusOutput is an end-to-end check that
171224
// exercises the real statusHandler + clockMetrics polling combination,
172225
// ensuring both metrics appear in the /node/metrics scrape output with the

core/consensus/mock/syncTimerMock.go

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import (
88
type SyncTimerMock struct {
99
ClockOffsetCalled func() time.Duration
1010
LastSyncTimestampCalled func() time.Time
11+
ClockSnapshotCalled func() (time.Duration, time.Time)
1112
CurrentTimeCalled func() time.Time
1213
}
1314

@@ -34,6 +35,18 @@ func (stm *SyncTimerMock) LastSyncTimestamp() time.Time {
3435
return time.Time{}
3536
}
3637

38+
// ClockSnapshot returns the configured (offset, lastSync) pair under a single
39+
// callback so tests can assert atomic-pair semantics. Falls back to the
40+
// individual getters when no explicit snapshot stub is set, which preserves
41+
// the existing behavior for tests that only configure one or both fields.
42+
func (stm *SyncTimerMock) ClockSnapshot() (time.Duration, time.Time) {
43+
if stm.ClockSnapshotCalled != nil {
44+
return stm.ClockSnapshotCalled()
45+
}
46+
47+
return stm.ClockOffset(), stm.LastSyncTimestamp()
48+
}
49+
3750
// FormattedCurrentTime method gets the formatted current time on which is added a given offset
3851
func (stm *SyncTimerMock) FormattedCurrentTime() string {
3952
return time.Unix(0, 0).String()

ntp/interface.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ type SyncTimer interface {
1010
StartSyncingTime()
1111
ClockOffset() time.Duration
1212
LastSyncTimestamp() time.Time
13+
ClockSnapshot() (offset time.Duration, lastSync time.Time)
1314
FormattedCurrentTime() string
1415
CurrentTime() time.Time
1516
IsInterfaceNil() bool

ntp/syncTime.go

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -284,6 +284,19 @@ func (s *syncTime) LastSyncTimestamp() time.Time {
284284
return ts
285285
}
286286

287+
// ClockSnapshot returns the current offset and last-sync timestamp under a
288+
// single RLock so callers see a coherent pair. Without this, a sync landing
289+
// between two separate getter calls could mix a fresh offset with a stale
290+
// timestamp on the same scrape.
291+
func (s *syncTime) ClockSnapshot() (offset time.Duration, lastSync time.Time) {
292+
s.mut.RLock()
293+
offset = s.clockOffset
294+
lastSync = s.lastSyncTimestamp
295+
s.mut.RUnlock()
296+
297+
return
298+
}
299+
287300
// recordSyncSuccess updates offset and timestamp under a single lock so
288301
// /node/metrics scrapes can't observe a fresh offset paired with a stale
289302
// timestamp (or vice versa) mid-update.

ntp/syncTime_test.go

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -398,3 +398,33 @@ func TestLastSyncTimestampStampedOnSuccess(t *testing.T) {
398398
assert.False(t, ts.Before(before), "timestamp must be set on or after the sync started")
399399
assert.False(t, ts.After(after), "timestamp must not be set in the future")
400400
}
401+
402+
func TestClockSnapshotReturnsConsistentPair(t *testing.T) {
403+
t.Parallel()
404+
405+
// After a successful sync, ClockSnapshot must return the offset and
406+
// timestamp written by that same sync — they're set together under one
407+
// lock in recordSyncSuccess and must be read together for callers
408+
// publishing them as paired metrics.
409+
st := ntp.NewSyncTime(
410+
config.NTPConfig{
411+
SyncPeriodSeconds: 1,
412+
Hosts: []string{"host1"},
413+
},
414+
func(options ntp.NTPOptions, hostIndex int) (*beevikNtp.Response, error) {
415+
return &beevikNtp.Response{ClockOffset: 23456}, nil
416+
},
417+
)
418+
419+
offset, ts := st.ClockSnapshot()
420+
assert.Equal(t, time.Duration(0), offset, "no sync yet, offset is zero")
421+
assert.True(t, ts.IsZero(), "no sync yet, timestamp is zero")
422+
423+
st.Sync()
424+
425+
offset, ts = st.ClockSnapshot()
426+
assert.Equal(t, time.Duration(23456), offset)
427+
assert.False(t, ts.IsZero())
428+
assert.Equal(t, st.ClockOffset(), offset, "snapshot offset must match individual getter")
429+
assert.Equal(t, st.LastSyncTimestamp(), ts, "snapshot timestamp must match individual getter")
430+
}

0 commit comments

Comments
 (0)