diff --git a/packages/dd-trace/src/runtime_metrics/index.js b/packages/dd-trace/src/runtime_metrics/index.js
index 9b2602844e7..7a1c6cbb4c8 100644
--- a/packages/dd-trace/src/runtime_metrics/index.js
+++ b/packages/dd-trace/src/runtime_metrics/index.js
@@ -17,7 +17,11 @@ module.exports = {
   start (config) {
     if (!config?.runtimeMetrics.enabled) return

-    runtimeMetrics = require('./runtime_metrics')
+    // Use OTLP runtime metrics with OTel-native naming when the OTel metrics
+    // pipeline is active. DogStatsD runtime metrics are skipped to avoid double-reporting.
+    runtimeMetrics = config.otelMetricsEnabled
+      ? require('./otlp_runtime_metrics')
+      : require('./runtime_metrics')

     Object.setPrototypeOf(module.exports, runtimeMetrics)
diff --git a/packages/dd-trace/src/runtime_metrics/otlp_runtime_metrics.js b/packages/dd-trace/src/runtime_metrics/otlp_runtime_metrics.js
new file mode 100644
index 00000000000..315aaddb920
--- /dev/null
+++ b/packages/dd-trace/src/runtime_metrics/otlp_runtime_metrics.js
@@ -0,0 +1,236 @@
'use strict'

// OTLP runtime metrics with OTel-native naming for Node.js
//
// OTel Node.js runtime metrics conventions:
// - v8js.memory.heap.* (V8 heap metrics)
// - nodejs.eventloop.delay.* (event loop delay)
// - process.cpu.utilization (CPU usage)
// - process.memory.usage (RSS memory)
//
// Semantic-core equivalence mappings:
// https://github.com/DataDog/semantic-core/blob/main/sor/domains/metrics/
// integrations/nodejs/_equivalence/otel_dd.yaml

const v8 = require('node:v8')
const process = require('node:process')
const os = require('node:os')
const { performance, monitorEventLoopDelay } = require('node:perf_hooks')
const log = require('../log')

const METER_NAME = 'datadog.runtime_metrics'

let meter = null
// addBatchObservableCallback() returns void in the OTel JS API; the callback
// and its observables must be kept so stop() can unregister them explicitly
// via removeBatchObservableCallback(). Otherwise the callback keeps firing on
// every collection cycle after stop().
let batchCallback = null
let batchObservables = null
let eventLoopHistogram = null
let lastCpuUsage = null
let lastTime = 0
// Previous event-loop-utilization sample, so each observation reports the
// utilization over the collection interval rather than the process lifetime.
let lastELU = null

module.exports = {
  start (config) {
    this.stop()

    try {
      const { metrics } = require('@opentelemetry/api')
      const meterProvider = metrics.getMeterProvider()

      if (!meterProvider) {
        log.error('OTLP runtime metrics: MeterProvider not available, OTel metrics pipeline may not be initialized.')
        return
      }

      meter = meterProvider.getMeter(METER_NAME)

      // Initialize CPU tracking
      lastCpuUsage = process.cpuUsage()
      lastTime = performance.now()
      lastELU = null

      // Initialize event loop delay monitoring
      const trackEventLoop = config.runtimeMetrics?.eventLoop !== false
      if (trackEventLoop && monitorEventLoopDelay) {
        eventLoopHistogram = monitorEventLoopDelay({ resolution: 4 })
        eventLoopHistogram.enable()
      }

      // --- V8 Heap Metrics ---
      // v8js.memory.heap.used - V8 heap used
      // Maps to: runtime.node.heap.used_size.by.space (via semantic-core)
      const heapUsed = meter.createObservableGauge('v8js.memory.heap.used', {
        unit: 'By',
        description: 'V8 heap memory used.',
      })

      // v8js.memory.heap.limit - V8 heap size limit
      // Maps to: runtime.node.heap.size.by.space
      const heapLimit = meter.createObservableGauge('v8js.memory.heap.limit', {
        unit: 'By',
        description: 'V8 heap memory total available size.',
      })

      // v8js.memory.heap.space.available_size - Available size per heap space
      // Maps to: runtime.node.heap.available_size.by.space
      const heapSpaceAvailable = meter.createObservableGauge('v8js.memory.heap.space.available_size', {
        unit: 'By',
        description: 'V8 heap space available size.',
      })

      // v8js.memory.heap.space.physical_size - Physical size per heap space
      // Maps to: runtime.node.heap.physical_size.by.space
      const heapSpacePhysical = meter.createObservableGauge('v8js.memory.heap.space.physical_size', {
        unit: 'By',
        description: 'V8 heap space physical size.',
      })

      // --- Process Metrics ---
      // process.memory.usage - RSS memory
      // Maps to: runtime.node.mem.rss
      const memoryUsage = meter.createObservableGauge('process.memory.usage', {
        unit: 'By',
        description: 'Process resident set size (RSS).',
      })

      // process.cpu.utilization - CPU utilization
      // Maps to: runtime.node.cpu.user / runtime.node.cpu.system
      // Attributes: process.cpu.state = {user, system}
      const cpuUtilization = meter.createObservableGauge('process.cpu.utilization', {
        unit: '1',
        description:
          'Difference in process.cpu.time since the last measurement, ' +
          'divided by the elapsed time and number of CPUs available to the process.',
      })

      // --- Event Loop Metrics ---
      // Event-loop delay instruments report nanoseconds, matching the native
      // resolution of perf_hooks.monitorEventLoopDelay().
      const eventLoopDelayMin = trackEventLoop
        ? meter.createObservableGauge('nodejs.eventloop.delay.min', {
          unit: 'ns',
          description: 'Event loop minimum delay.',
        })
        : null

      const eventLoopDelayMax = trackEventLoop
        ? meter.createObservableGauge('nodejs.eventloop.delay.max', {
          unit: 'ns',
          description: 'Event loop maximum delay.',
        })
        : null

      const eventLoopDelayMean = trackEventLoop
        ? meter.createObservableGauge('nodejs.eventloop.delay.mean', {
          unit: 'ns',
          description: 'Event loop mean delay.',
        })
        : null

      const eventLoopDelayP50 = trackEventLoop
        ? meter.createObservableGauge('nodejs.eventloop.delay.p50', {
          unit: 'ns',
          description: 'Event loop 50th percentile delay.',
        })
        : null

      const eventLoopDelayP90 = trackEventLoop
        ? meter.createObservableGauge('nodejs.eventloop.delay.p90', {
          unit: 'ns',
          description: 'Event loop 90th percentile delay.',
        })
        : null

      const eventLoopDelayP99 = trackEventLoop
        ? meter.createObservableGauge('nodejs.eventloop.delay.p99', {
          unit: 'ns',
          description: 'Event loop 99th percentile delay.',
        })
        : null

      // nodejs.eventloop.utilization — the one that IS mapped in semantic-core
      const eventLoopUtilization = trackEventLoop && performance.eventLoopUtilization
        ? meter.createObservableGauge('nodejs.eventloop.utilization', {
          unit: '1',
          description: 'Event loop utilization ratio.',
        })
        : null

      // Register batch callback for all observable instruments
      const observables = [
        heapUsed, heapLimit, heapSpaceAvailable, heapSpacePhysical,
        memoryUsage, cpuUtilization,
      ]
      if (trackEventLoop) {
        observables.push(
          eventLoopDelayMin, eventLoopDelayMax, eventLoopDelayMean,
          eventLoopDelayP50, eventLoopDelayP90, eventLoopDelayP99,
        )
        if (eventLoopUtilization) observables.push(eventLoopUtilization)
      }

      batchObservables = observables
      batchCallback = (observer) => {
        // V8 heap statistics
        const heapStats = v8.getHeapStatistics()
        observer.observe(heapUsed, heapStats.used_heap_size)
        observer.observe(heapLimit, heapStats.heap_size_limit)

        // V8 heap space statistics (with v8js.heap.space.name attribute)
        const heapSpaces = v8.getHeapSpaceStatistics()
        for (const space of heapSpaces) {
          const attrs = { 'v8js.heap.space.name': space.space_name }
          observer.observe(heapSpaceAvailable, space.space_available_size, attrs)
          observer.observe(heapSpacePhysical, space.physical_space_size, attrs)
        }

        // Process memory (RSS)
        const mem = process.memoryUsage()
        observer.observe(memoryUsage, mem.rss)

        // CPU utilization
        const now = performance.now()
        const elapsed = (now - lastTime) / 1000 // seconds
        const cpuUsage = process.cpuUsage()
        // availableParallelism() is container-aware (respects CPU limits);
        // fall back to os.cpus().length on Node versions that lack it.
        const numCpus = os.availableParallelism?.() ?? os.cpus().length

        if (elapsed > 0 && lastCpuUsage) {
          const userDelta = (cpuUsage.user - lastCpuUsage.user) / 1e6 // microseconds to seconds
          const systemDelta = (cpuUsage.system - lastCpuUsage.system) / 1e6
          observer.observe(cpuUtilization, userDelta / (elapsed * numCpus), { 'process.cpu.state': 'user' })
          observer.observe(cpuUtilization, systemDelta / (elapsed * numCpus), { 'process.cpu.state': 'system' })
        }

        lastCpuUsage = cpuUsage
        lastTime = now

        // Event loop delay. The perf_hooks histogram already reports values
        // in nanoseconds, so no unit conversion is needed.
        if (trackEventLoop && eventLoopHistogram) {
          observer.observe(eventLoopDelayMin, eventLoopHistogram.min)
          observer.observe(eventLoopDelayMax, eventLoopHistogram.max)
          observer.observe(eventLoopDelayMean, eventLoopHistogram.mean)
          observer.observe(eventLoopDelayP50, eventLoopHistogram.percentile(50))
          observer.observe(eventLoopDelayP90, eventLoopHistogram.percentile(90))
          observer.observe(eventLoopDelayP99, eventLoopHistogram.percentile(99))
          eventLoopHistogram.reset()
        }

        // Event loop utilization (the one mapped in semantic-core).
        // Report the utilization over the interval since the previous
        // observation; the very first observation covers the process lifetime.
        if (eventLoopUtilization && performance.eventLoopUtilization) {
          const eluNow = performance.eventLoopUtilization()
          const elu = lastELU ? performance.eventLoopUtilization(eluNow, lastELU) : eluNow
          observer.observe(eventLoopUtilization, elu.utilization)
          lastELU = eluNow
        }
      }

      meter.addBatchObservableCallback(batchCallback, batchObservables)

      log.debug('Started OTLP runtime metrics with OTel-native naming (v8js.*, nodejs.*, process.*)')
    } catch (err) {
      log.error('Failed to start OTLP runtime metrics:', err)
    }
  },

  stop () {
    // Unregister the batch callback; addBatchObservableCallback() returns void,
    // so removal must go through removeBatchObservableCallback() with the same
    // callback/observables pair.
    if (meter && batchCallback) {
      meter.removeBatchObservableCallback?.(batchCallback, batchObservables)
    }
    batchCallback = null
    batchObservables = null
    if (eventLoopHistogram) {
      eventLoopHistogram.disable()
      eventLoopHistogram = null
    }
    meter = null
    lastCpuUsage = null
    lastTime = 0
    lastELU = null
  },
}
diff --git a/packages/dd-trace/test/runtime_metrics/otlp_runtime_metrics.spec.js b/packages/dd-trace/test/runtime_metrics/otlp_runtime_metrics.spec.js
new file mode 100644
index 00000000000..37104296e95
--- /dev/null
+++ b/packages/dd-trace/test/runtime_metrics/otlp_runtime_metrics.spec.js
@@ -0,0 +1,182 @@
'use strict'

const assert = require('node:assert/strict')
const { describe, it, beforeEach, afterEach } = require('mocha')
const proxyquire = require('proxyquire').noCallThru()

describe('otlp_runtime_metrics', () => {
  let otlpMetrics
  let mockMeter
  let mockMeterProvider
  let observeCallbacks
  let removedCallbacks
  let createdGauges

  beforeEach(() => {
    observeCallbacks = []
    removedCallbacks = []
    createdGauges = {}

    mockMeter = {
      createObservableGauge (name, opts) {
        const gauge = { name, opts }
        createdGauges[name] = gauge
        return gauge
      },
      addBatchObservableCallback (callback, observables) {
        observeCallbacks.push(callback)
      },
      removeBatchObservableCallback (callback, observables) {
        // Mirrors the OTel API: callbacks are removed by identity.
        removedCallbacks.push(callback)
      },
    }

    mockMeterProvider = {
      getMeter (name) {
        return mockMeter
      },
    }

    otlpMetrics = proxyquire('../../src/runtime_metrics/otlp_runtime_metrics', {
      '@opentelemetry/api': {
        metrics: {
          getMeterProvider () {
            return mockMeterProvider
          },
        },
      },
      '../log': {
        debug () {},
        error () {},
      },
    })
  })

  afterEach(() => {
    otlpMetrics.stop()
  })

  it('should create OTel-native metric instruments', () => {
    otlpMetrics.start({ runtimeMetrics: { eventLoop: true } })

    // V8 heap metrics
    assert.ok(createdGauges['v8js.memory.heap.used'], 'v8js.memory.heap.used should be created')
    assert.ok(createdGauges['v8js.memory.heap.limit'], 'v8js.memory.heap.limit should be created')
    assert.ok(
      createdGauges['v8js.memory.heap.space.available_size'],
      'v8js.memory.heap.space.available_size should be created'
    )
    assert.ok(
      createdGauges['v8js.memory.heap.space.physical_size'],
      'v8js.memory.heap.space.physical_size should be created'
    )

    // Process metrics
    assert.ok(createdGauges['process.memory.usage'], 'process.memory.usage should be created')
    assert.ok(createdGauges['process.cpu.utilization'], 'process.cpu.utilization should be created')

    // Event loop metrics
    assert.ok(createdGauges['nodejs.eventloop.delay.min'], 'nodejs.eventloop.delay.min should be created')
    assert.ok(createdGauges['nodejs.eventloop.delay.max'], 'nodejs.eventloop.delay.max should be created')
    assert.ok(createdGauges['nodejs.eventloop.delay.mean'], 'nodejs.eventloop.delay.mean should be created')
    assert.ok(createdGauges['nodejs.eventloop.delay.p50'], 'nodejs.eventloop.delay.p50 should be created')
    assert.ok(createdGauges['nodejs.eventloop.delay.p90'], 'nodejs.eventloop.delay.p90 should be created')
    assert.ok(createdGauges['nodejs.eventloop.delay.p99'], 'nodejs.eventloop.delay.p99 should be created')
    assert.ok(
      createdGauges['nodejs.eventloop.utilization'],
      'nodejs.eventloop.utilization should be created'
    )
  })

  it('should use correct units on instruments', () => {
    otlpMetrics.start({ runtimeMetrics: {} })

    assert.equal(createdGauges['v8js.memory.heap.used'].opts.unit, 'By')
    assert.equal(createdGauges['process.memory.usage'].opts.unit, 'By')
    assert.equal(createdGauges['process.cpu.utilization'].opts.unit, '1')
  })

  it('should register a batch callback', () => {
    otlpMetrics.start({ runtimeMetrics: {} })
    assert.equal(observeCallbacks.length, 1, 'should register one batch callback')
  })

  it('should observe positive values in callback', () => {
    otlpMetrics.start({ runtimeMetrics: {} })

    const observations = []
    const observer = {
      observe (instrument, value, attrs) {
        observations.push({ name: instrument.name, value, attrs })
      },
    }

    // Execute the callback
    observeCallbacks[0](observer)

    // Check that heap metrics were observed with positive values
    const heapUsed = observations.find(o => o.name === 'v8js.memory.heap.used')
    assert.ok(heapUsed, 'v8js.memory.heap.used should be observed')
    assert.ok(heapUsed.value > 0, 'heap used should be positive')

    const memUsage = observations.find(o => o.name === 'process.memory.usage')
    assert.ok(memUsage, 'process.memory.usage should be observed')
    assert.ok(memUsage.value > 0, 'RSS should be positive')
  })

  it('should include v8js.heap.space.name attribute on heap space metrics', () => {
    otlpMetrics.start({ runtimeMetrics: {} })

    const observations = []
    const observer = {
      observe (instrument, value, attrs) {
        observations.push({ name: instrument.name, value, attrs })
      },
    }

    observeCallbacks[0](observer)

    const spaceMetrics = observations.filter(o => o.name === 'v8js.memory.heap.space.available_size')
    assert.ok(spaceMetrics.length > 0, 'should have heap space metrics')
    assert.ok(spaceMetrics.some(m => m.attrs?.['v8js.heap.space.name'] === 'new_space'), 'should have new_space')
    assert.ok(spaceMetrics.some(m => m.attrs?.['v8js.heap.space.name'] === 'old_space'), 'should have old_space')
  })

  it('should include process.cpu.state attribute on CPU metrics', () => {
    otlpMetrics.start({ runtimeMetrics: {} })

    const observations = []
    const observer = {
      observe (instrument, value, attrs) {
        observations.push({ name: instrument.name, value, attrs })
      },
    }

    // Need two callback invocations for CPU delta (first sets baseline)
    observeCallbacks[0](observer)
    observations.length = 0
    observeCallbacks[0](observer)

    const cpuMetrics = observations.filter(o => o.name === 'process.cpu.utilization')
    assert.ok(cpuMetrics.length === 2, 'should have user and system CPU metrics')
    assert.ok(cpuMetrics.some(m => m.attrs?.['process.cpu.state'] === 'user'), 'should have cpu.state=user')
    assert.ok(cpuMetrics.some(m => m.attrs?.['process.cpu.state'] === 'system'), 'should have cpu.state=system')
  })

  it('should not create event loop metrics when disabled', () => {
    otlpMetrics.start({ runtimeMetrics: { eventLoop: false } })

    assert.ok(!createdGauges['nodejs.eventloop.delay.min'], 'event loop min should not be created')
    assert.ok(!createdGauges['nodejs.eventloop.delay.max'], 'event loop max should not be created')
    assert.ok(!createdGauges['nodejs.eventloop.delay.p90'], 'event loop p90 should not be created')
    assert.ok(!createdGauges['nodejs.eventloop.delay.p99'], 'event loop p99 should not be created')
    assert.ok(!createdGauges['nodejs.eventloop.utilization'], 'event loop utilization should not be created')
  })

  it('should unregister the batch callback on stop', () => {
    otlpMetrics.start({ runtimeMetrics: {} })
    assert.equal(observeCallbacks.length, 1)

    otlpMetrics.stop()
    // stop() must remove the exact callback it registered, otherwise the
    // meter keeps invoking it on every collection cycle.
    assert.equal(removedCallbacks.length, 1, 'should remove the batch callback')
    assert.equal(removedCallbacks[0], observeCallbacks[0], 'should remove the registered callback')

    // After stop, internal state should be cleared and restart should work
    otlpMetrics.start({ runtimeMetrics: {} })
    assert.equal(observeCallbacks.length, 2)
  })
})