Skip to content

Commit ca2e918

Browse files
mcollinaclaude
andauthored
fix: add cgroup CFS throttling metrics to OTel meter (#1114)
Register four observable instruments on the existing MeterProvider to surface container CPU bandwidth-control state so GC pauses can be correlated with throttling in our observability backend: - process.cpu.cfs.periods (counter) - process.cpu.cfs.throttled_periods (counter) - process.cpu.cfs.throttled_time (counter, ns) - process.cpu.cfs.throttled_ratio (gauge, delta-based) Supports cgroup v1 and v2, short-circuits cleanly on non-Linux and when cpu.stat is unreadable, and reads the file from the OTel observable callback (no separate timer). Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 9a13d3e commit ca2e918

4 files changed

Lines changed: 361 additions & 0 deletions

File tree

Lines changed: 183 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,183 @@
1+
import * as fs from 'node:fs'
2+
import { Meter } from '@opentelemetry/api'
3+
import { afterEach, beforeEach, describe, expect, Mock, test, vi } from 'vitest'
4+
import { installCgroupCpuMetrics, parseCpuStat } from './cgroup-cpu-metrics'
5+
6+
vi.mock('@internal/monitoring/logger', () => ({
7+
logger: { debug: vi.fn(), info: vi.fn(), warn: vi.fn(), error: vi.fn() },
8+
logSchema: {
9+
info: vi.fn(),
10+
warning: vi.fn(),
11+
error: vi.fn(),
12+
event: vi.fn(),
13+
request: vi.fn(),
14+
},
15+
}))
16+
17+
vi.mock('node:fs', async () => {
18+
const actual = await vi.importActual<typeof import('node:fs')>('node:fs')
19+
return {
20+
...actual,
21+
existsSync: vi.fn(),
22+
readFileSync: vi.fn(),
23+
}
24+
})
25+
26+
const existsSync = fs.existsSync as unknown as Mock
27+
const readFileSync = fs.readFileSync as unknown as Mock
28+
29+
const V2_BLOB = `usage_usec 1234567
30+
user_usec 1000000
31+
system_usec 234567
32+
nr_periods 1000
33+
nr_throttled 25
34+
throttled_usec 5000
35+
`
36+
37+
const V1_BLOB = `nr_periods 500
38+
nr_throttled 12
39+
throttled_time 7500000
40+
`
41+
42+
interface CapturedInstrument {
43+
name: string
44+
callback: (observable: { observe: (value: number) => void }) => void
45+
}
46+
47+
interface CapturedObservation {
48+
name: string
49+
value: number
50+
}
51+
52+
function createMockMeter(): {
53+
meter: Meter
54+
instruments: CapturedInstrument[]
55+
invoke: (name: string) => CapturedObservation[]
56+
} {
57+
const instruments: CapturedInstrument[] = []
58+
59+
const make = () => (name: string) => ({
60+
addCallback(callback: CapturedInstrument['callback']) {
61+
instruments.push({ name, callback })
62+
return this
63+
},
64+
})
65+
66+
const meter = {
67+
createObservableCounter: make(),
68+
createObservableGauge: make(),
69+
} as unknown as Meter
70+
71+
const invoke = (name: string): CapturedObservation[] => {
72+
const observations: CapturedObservation[] = []
73+
const observable = { observe: (value: number) => observations.push({ name, value }) }
74+
for (const inst of instruments) {
75+
if (inst.name === name) inst.callback(observable)
76+
}
77+
return observations
78+
}
79+
80+
return { meter, instruments, invoke }
81+
}
82+
83+
describe('parseCpuStat', () => {
84+
test('parses cgroup v2 blob and converts throttled_usec to ns', () => {
85+
expect(parseCpuStat(V2_BLOB, 'v2')).toEqual({
86+
nr_periods: 1000,
87+
nr_throttled: 25,
88+
throttled_time_ns: 5_000_000,
89+
})
90+
})
91+
92+
test('parses cgroup v1 blob and keeps throttled_time as ns', () => {
93+
expect(parseCpuStat(V1_BLOB, 'v1')).toEqual({
94+
nr_periods: 500,
95+
nr_throttled: 12,
96+
throttled_time_ns: 7_500_000,
97+
})
98+
})
99+
100+
test('returns null when required fields are missing', () => {
101+
expect(parseCpuStat('nr_periods 10\nnr_throttled 1\n', 'v2')).toBeNull()
102+
})
103+
})
104+
105+
describe('installCgroupCpuMetrics', () => {
106+
const originalPlatform = process.platform
107+
108+
beforeEach(() => {
109+
existsSync.mockReset()
110+
readFileSync.mockReset()
111+
})
112+
113+
afterEach(() => {
114+
Object.defineProperty(process, 'platform', { value: originalPlatform })
115+
})
116+
117+
test('short-circuits on non-Linux platforms and registers no instruments', () => {
118+
Object.defineProperty(process, 'platform', { value: 'darwin' })
119+
120+
const { meter, instruments } = createMockMeter()
121+
installCgroupCpuMetrics(meter)
122+
123+
expect(instruments).toHaveLength(0)
124+
expect(readFileSync).not.toHaveBeenCalled()
125+
expect(existsSync).not.toHaveBeenCalled()
126+
})
127+
128+
test('emits 0 for throttled_ratio on the first sample (divide-by-zero guard)', () => {
129+
Object.defineProperty(process, 'platform', { value: 'linux' })
130+
existsSync.mockReturnValue(true)
131+
readFileSync.mockReturnValue(V2_BLOB)
132+
133+
const { meter, invoke } = createMockMeter()
134+
installCgroupCpuMetrics(meter)
135+
136+
expect(invoke('process.cpu.cfs.throttled_ratio')).toEqual([
137+
{ name: 'process.cpu.cfs.throttled_ratio', value: 0 },
138+
])
139+
})
140+
141+
test('computes throttled_ratio between observations and avoids NaN when no new periods', () => {
142+
Object.defineProperty(process, 'platform', { value: 'linux' })
143+
existsSync.mockReturnValue(true)
144+
readFileSync.mockReturnValue(V2_BLOB)
145+
146+
const { meter, invoke } = createMockMeter()
147+
installCgroupCpuMetrics(meter)
148+
149+
// First sample → 0
150+
invoke('process.cpu.cfs.throttled_ratio')
151+
152+
// Second sample: +100 periods, +5 throttled → 0.05
153+
readFileSync.mockReturnValue(`nr_periods 1100
154+
nr_throttled 30
155+
throttled_usec 5000
156+
`)
157+
const second = invoke('process.cpu.cfs.throttled_ratio')
158+
expect(second).toEqual([{ name: 'process.cpu.cfs.throttled_ratio', value: 0.05 }])
159+
160+
// Third sample identical → no new periods → 0, not NaN
161+
const third = invoke('process.cpu.cfs.throttled_ratio')
162+
expect(third).toEqual([{ name: 'process.cpu.cfs.throttled_ratio', value: 0 }])
163+
})
164+
165+
test('observes counter values parsed from cpu.stat', () => {
166+
Object.defineProperty(process, 'platform', { value: 'linux' })
167+
existsSync.mockReturnValue(true)
168+
readFileSync.mockReturnValue(V2_BLOB)
169+
170+
const { meter, invoke } = createMockMeter()
171+
installCgroupCpuMetrics(meter)
172+
173+
expect(invoke('process.cpu.cfs.periods')).toEqual([
174+
{ name: 'process.cpu.cfs.periods', value: 1000 },
175+
])
176+
expect(invoke('process.cpu.cfs.throttled_periods')).toEqual([
177+
{ name: 'process.cpu.cfs.throttled_periods', value: 25 },
178+
])
179+
expect(invoke('process.cpu.cfs.throttled_time')).toEqual([
180+
{ name: 'process.cpu.cfs.throttled_time', value: 5_000_000 },
181+
])
182+
})
183+
})
Lines changed: 164 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,164 @@
1+
// Project-local metric names — these are NOT part of the OpenTelemetry semantic
2+
// conventions. The names `process.cpu.cfs.*` describe Linux cgroup CFS (Completely
3+
// Fair Scheduler) bandwidth-control state and are kept stable for our backends.
4+
5+
import * as fs from 'node:fs'
6+
import { logger, logSchema } from '@internal/monitoring/logger'
7+
import { Meter } from '@opentelemetry/api'
8+
9+
const V2_CONTROLLERS = '/sys/fs/cgroup/cgroup.controllers'
10+
const V2_STAT = '/sys/fs/cgroup/cpu.stat'
11+
const V1_PRIMARY = '/sys/fs/cgroup/cpu,cpuacct/cpu.stat'
12+
const V1_FALLBACK = '/sys/fs/cgroup/cpu/cpu.stat'
13+
14+
export type CgroupVersion = 'v1' | 'v2'
15+
16+
export interface CgroupCpuStatSample {
17+
nr_periods: number
18+
nr_throttled: number
19+
throttled_time_ns: number
20+
}
21+
22+
export interface CgroupSource {
23+
version: CgroupVersion
24+
path: string
25+
}
26+
27+
export function parseCpuStat(content: string, version: CgroupVersion): CgroupCpuStatSample | null {
28+
let nrPeriods: number | null = null
29+
let nrThrottled: number | null = null
30+
let throttledNs: number | null = null
31+
32+
for (const rawLine of content.split('\n')) {
33+
const line = rawLine.trim()
34+
if (!line) continue
35+
const space = line.indexOf(' ')
36+
if (space === -1) continue
37+
const key = line.slice(0, space)
38+
const value = Number(line.slice(space + 1).trim())
39+
if (!Number.isFinite(value)) continue
40+
41+
if (key === 'nr_periods') {
42+
nrPeriods = value
43+
} else if (key === 'nr_throttled') {
44+
nrThrottled = value
45+
} else if (version === 'v2' && key === 'throttled_usec') {
46+
throttledNs = value * 1000
47+
} else if (version === 'v1' && key === 'throttled_time') {
48+
throttledNs = value
49+
}
50+
}
51+
52+
if (nrPeriods === null || nrThrottled === null || throttledNs === null) {
53+
return null
54+
}
55+
return {
56+
nr_periods: nrPeriods,
57+
nr_throttled: nrThrottled,
58+
throttled_time_ns: throttledNs,
59+
}
60+
}
61+
62+
export function detectCgroupSource(): CgroupSource | null {
63+
if (process.platform !== 'linux') return null
64+
65+
try {
66+
if (fs.existsSync(V2_CONTROLLERS)) {
67+
fs.readFileSync(V2_STAT, 'utf8')
68+
return { version: 'v2', path: V2_STAT }
69+
}
70+
} catch {
71+
// Fall through to v1 candidates
72+
}
73+
74+
for (const path of [V1_PRIMARY, V1_FALLBACK]) {
75+
try {
76+
fs.readFileSync(path, 'utf8')
77+
return { version: 'v1', path }
78+
} catch {
79+
// Try next
80+
}
81+
}
82+
83+
return null
84+
}
85+
86+
export function installCgroupCpuMetrics(meter: Meter): void {
87+
const source = detectCgroupSource()
88+
if (!source) {
89+
logger.debug(
90+
{ type: 'cgroup-cpu-metrics', platform: process.platform },
91+
'[cgroup CPU metrics] cgroup cpu.stat not available, skipping'
92+
)
93+
return
94+
}
95+
96+
let previous: { nr_periods: number; nr_throttled: number } | null = null
97+
let readErrorLogged = false
98+
99+
const readSample = (): CgroupCpuStatSample | null => {
100+
try {
101+
const content = fs.readFileSync(source.path, 'utf8')
102+
return parseCpuStat(content, source.version)
103+
} catch (error) {
104+
if (!readErrorLogged) {
105+
readErrorLogged = true
106+
logSchema.warning(logger, '[cgroup CPU metrics] Failed to read cpu.stat', {
107+
type: 'cgroup-cpu-metrics',
108+
error,
109+
})
110+
}
111+
return null
112+
}
113+
}
114+
115+
meter
116+
.createObservableCounter('process.cpu.cfs.periods', {
117+
description: 'Total CFS periods elapsed for the cgroup',
118+
unit: '{period}',
119+
})
120+
.addCallback((observable) => {
121+
const sample = readSample()
122+
if (sample) observable.observe(sample.nr_periods)
123+
})
124+
125+
meter
126+
.createObservableCounter('process.cpu.cfs.throttled_periods', {
127+
description: 'CFS periods where the cgroup was throttled',
128+
unit: '{period}',
129+
})
130+
.addCallback((observable) => {
131+
const sample = readSample()
132+
if (sample) observable.observe(sample.nr_throttled)
133+
})
134+
135+
meter
136+
.createObservableCounter('process.cpu.cfs.throttled_time', {
137+
description: 'Total time the cgroup was throttled, in nanoseconds',
138+
unit: 'ns',
139+
})
140+
.addCallback((observable) => {
141+
const sample = readSample()
142+
if (sample) observable.observe(sample.throttled_time_ns)
143+
})
144+
145+
meter
146+
.createObservableGauge('process.cpu.cfs.throttled_ratio', {
147+
description: 'Fraction of recent CFS periods that were throttled',
148+
unit: '1',
149+
})
150+
.addCallback((observable) => {
151+
const sample = readSample()
152+
if (!sample) return
153+
if (!previous) {
154+
previous = { nr_periods: sample.nr_periods, nr_throttled: sample.nr_throttled }
155+
observable.observe(0)
156+
return
157+
}
158+
const dPeriods = sample.nr_periods - previous.nr_periods
159+
const dThrottled = sample.nr_throttled - previous.nr_throttled
160+
previous = { nr_periods: sample.nr_periods, nr_throttled: sample.nr_throttled }
161+
const ratio = dPeriods > 0 ? dThrottled / dPeriods : 0
162+
observable.observe(Number.isFinite(ratio) ? ratio : 0)
163+
})
164+
}

0 commit comments

Comments
 (0)