Skip to content

Commit ac933e3

Browse files
committed
feat(metrics): add node metrics collection and /metrics/nodes export
Add NodeMetricsSnapshot collection from the Raft-backed node map and export it via /metrics/nodes using prometheus-client. Export cluster-level node state/resource gauges plus per-node labeled resource/telemetry gauges, and add golden fixtures/tests (with normalization for Family ordering).
1 parent 1a74ad3 commit ac933e3

9 files changed

Lines changed: 854 additions & 10 deletions

File tree

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
---
2+
description: Require explicit approval of commit messages
3+
alwaysApply: true
4+
---
5+
6+
# Git — commit message requires explicit approval
7+
8+
Whenever running any command that creates or modifies a git commit (including `git commit`, `git commit --amend`, `git commit-tree`, rebases that rewrite commits, or anything that changes commit messages), you MUST do the following:
9+
10+
- You MUST present the **exact proposed commit subject and body text** to the user first.
11+
- You MUST wait for the user to explicitly approve that exact text **in the same turn** before running the commit-rewriting command.
12+
- If the user asks to change the message, you MUST show the full revised message and get explicit approval again before proceeding.
13+
- If approval is not explicit and unambiguous, DO NOT create/amend/rewrite commits.
14+
15+
Allowed without approval:
16+
17+
- Drafting candidate commit messages in chat (no git side effects).
18+
- Read-only git commands (`git status`, `git diff`, `git log`, `git show`, etc.).

crates/spur-metrics/src/export/nodes.rs

Lines changed: 205 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,33 +3,232 @@
33

44
//! Node gauge registration for `/metrics/nodes` (Layer 1b).
55
6+
use prometheus_client::encoding::EncodeLabelSet;
7+
use prometheus_client::metrics::family::Family;
8+
use prometheus_client::metrics::gauge::Gauge;
69
use prometheus_client::registry::Registry;
10+
use spur_core::node::NodeState;
711
use spur_core::config::MetricsExpositionFormat;
12+
use std::sync::atomic::AtomicU64;
813

914
use crate::export::encode_registered;
15+
use crate::node::{node_state_metric_suffix, NodeMetricsSnapshot};
1016

11-
/// Register node catalog gauges (stub until `NodeMetricsSnapshot` exists).
12-
pub fn register_nodes(_registry: &mut Registry) {}
17+
#[derive(Clone, Debug, Hash, PartialEq, Eq, EncodeLabelSet)]
18+
struct NodeLabel {
19+
node: String,
20+
}
21+
22+
fn register_gauge(registry: &mut Registry, name: &str, help: &str, value: u64) {
23+
let gauge = Gauge::<u64, AtomicU64>::default();
24+
gauge.set(value);
25+
registry.register(name, help, gauge);
26+
}
27+
28+
fn set_family_gauge(family: &Family<NodeLabel, Gauge<u64, AtomicU64>>, node: &str, value: u64) {
29+
let gauge = family.get_or_create(&NodeLabel {
30+
node: node.to_string(),
31+
});
32+
gauge.set(value);
33+
}
34+
35+
/// Register node catalog gauges into `registry` from `snap`.
36+
pub fn register_nodes(registry: &mut Registry, snap: &NodeMetricsSnapshot) {
37+
register_gauge(registry, "spur_nodes", "Total number of nodes", snap.total);
38+
39+
for &state in &NodeState::ALL {
40+
let suffix = node_state_metric_suffix(state);
41+
let name = format!("spur_nodes_{suffix}");
42+
let help = format!("Number of nodes in {} state", state.display());
43+
register_gauge(registry, &name, &help, snap.count_state(state));
44+
}
45+
46+
register_gauge(
47+
registry,
48+
"spur_nodes_cpus",
49+
"Total CPUs across all nodes",
50+
snap.total_cpus,
51+
);
52+
register_gauge(
53+
registry,
54+
"spur_nodes_cpus_alloc",
55+
"Total CPUs allocated across all nodes",
56+
snap.alloc_cpus,
57+
);
58+
register_gauge(
59+
registry,
60+
"spur_nodes_memory_bytes",
61+
"Total memory in bytes across all nodes",
62+
snap.total_memory_bytes,
63+
);
64+
register_gauge(
65+
registry,
66+
"spur_nodes_memory_alloc_bytes",
67+
"Total memory in bytes allocated across all nodes",
68+
snap.alloc_memory_bytes,
69+
);
70+
register_gauge(
71+
registry,
72+
"spur_nodes_gpus",
73+
"Total GPUs across all nodes",
74+
snap.total_gpus,
75+
);
76+
register_gauge(
77+
registry,
78+
"spur_nodes_gpus_alloc",
79+
"Total GPUs allocated across all nodes",
80+
snap.alloc_gpus,
81+
);
82+
83+
let node_cpus = Family::<NodeLabel, Gauge<u64, AtomicU64>>::default();
84+
let node_cpus_alloc = Family::<NodeLabel, Gauge<u64, AtomicU64>>::default();
85+
let node_memory_bytes = Family::<NodeLabel, Gauge<u64, AtomicU64>>::default();
86+
let node_memory_alloc_bytes = Family::<NodeLabel, Gauge<u64, AtomicU64>>::default();
87+
let node_gpus = Family::<NodeLabel, Gauge<u64, AtomicU64>>::default();
88+
let node_gpus_alloc = Family::<NodeLabel, Gauge<u64, AtomicU64>>::default();
89+
let node_cpu_load = Family::<NodeLabel, Gauge<u64, AtomicU64>>::default();
90+
let node_free_memory_bytes = Family::<NodeLabel, Gauge<u64, AtomicU64>>::default();
91+
92+
for node in &snap.per_node {
93+
set_family_gauge(&node_cpus, &node.name, node.total_cpus);
94+
set_family_gauge(&node_cpus_alloc, &node.name, node.alloc_cpus);
95+
set_family_gauge(&node_memory_bytes, &node.name, node.total_memory_bytes);
96+
set_family_gauge(
97+
&node_memory_alloc_bytes,
98+
&node.name,
99+
node.alloc_memory_bytes,
100+
);
101+
set_family_gauge(&node_gpus, &node.name, node.total_gpus);
102+
set_family_gauge(&node_gpus_alloc, &node.name, node.alloc_gpus);
103+
set_family_gauge(&node_cpu_load, &node.name, node.cpu_load);
104+
set_family_gauge(
105+
&node_free_memory_bytes,
106+
&node.name,
107+
node.free_memory_bytes,
108+
);
109+
}
110+
111+
registry.register(
112+
"spur_node_cpus",
113+
"CPUs on the specified node",
114+
node_cpus,
115+
);
116+
registry.register(
117+
"spur_node_cpus_alloc",
118+
"CPUs allocated on the specified node",
119+
node_cpus_alloc,
120+
);
121+
registry.register(
122+
"spur_node_memory_bytes",
123+
"Memory in bytes on the specified node",
124+
node_memory_bytes,
125+
);
126+
registry.register(
127+
"spur_node_memory_alloc_bytes",
128+
"Memory in bytes allocated on the specified node",
129+
node_memory_alloc_bytes,
130+
);
131+
registry.register("spur_node_gpus", "GPUs on the specified node", node_gpus);
132+
registry.register(
133+
"spur_node_gpus_alloc",
134+
"GPUs allocated on the specified node",
135+
node_gpus_alloc,
136+
);
137+
registry.register(
138+
"spur_node_cpu_load",
139+
"CPU load reported by the node agent",
140+
node_cpu_load,
141+
);
142+
registry.register(
143+
"spur_node_free_memory_bytes",
144+
"Free memory in bytes reported by the node agent",
145+
node_free_memory_bytes,
146+
);
147+
}
13148

14149
/// Encode node metrics for `/metrics/nodes`.
15-
pub fn encode_nodes_metrics(format: MetricsExpositionFormat) -> String {
16-
encode_registered(register_nodes, format)
150+
pub fn encode_nodes_metrics_with_format(
151+
snap: &NodeMetricsSnapshot,
152+
format: MetricsExpositionFormat,
153+
) -> String {
154+
encode_registered(|registry| register_nodes(registry, snap), format)
155+
}
156+
157+
/// Encode node metrics for `/metrics/nodes` (default: Slurm 0.0.4 text).
158+
pub fn encode_nodes_metrics(snap: &NodeMetricsSnapshot) -> String {
159+
encode_nodes_metrics_with_format(snap, MetricsExpositionFormat::default())
17160
}
18161

19162
#[cfg(test)]
20163
mod tests {
21164
use super::*;
165+
use crate::node::NodeMetricsSnapshot;
166+
use spur_core::node::Node;
167+
use spur_core::resource::{GpuLinkType, GpuResource, ResourceSet};
22168

23169
#[test]
24170
fn empty_nodes_export_slurm_has_no_samples() {
25-
let body = encode_nodes_metrics(MetricsExpositionFormat::Slurm_0_0_4);
171+
let body = encode_nodes_metrics_with_format(
172+
&NodeMetricsSnapshot::default(),
173+
MetricsExpositionFormat::Slurm_0_0_4,
174+
);
26175
assert!(!body.contains("spur_"));
27176
assert!(!body.contains("# EOF"));
28177
}
29178

30179
#[test]
31180
fn empty_nodes_export_openmetrics_has_eof_only() {
32-
let body = encode_nodes_metrics(MetricsExpositionFormat::OpenMetrics_1_0);
181+
let body = encode_nodes_metrics_with_format(
182+
&NodeMetricsSnapshot::default(),
183+
MetricsExpositionFormat::OpenMetrics_1_0,
184+
);
33185
assert_eq!(body, "# EOF\n");
34186
}
187+
188+
fn resources(cpus: u32, memory_mb: u64, gpu_count: u32) -> ResourceSet {
189+
let mut gpus = Vec::new();
190+
for i in 0..gpu_count {
191+
gpus.push(GpuResource {
192+
device_id: i,
193+
gpu_type: "mi300x".into(),
194+
memory_mb: 0,
195+
peer_gpus: vec![],
196+
link_type: GpuLinkType::XGMI,
197+
});
198+
}
199+
ResourceSet {
200+
cpus,
201+
memory_mb,
202+
gpus,
203+
generic: Default::default(),
204+
}
205+
}
206+
207+
#[test]
208+
fn export_contains_core_gauges_and_per_node_families() {
209+
let mut n1 = Node::new("node-a".into(), resources(8, 16384, 2));
210+
n1.state = NodeState::Idle;
211+
n1.cpu_load = 12;
212+
n1.free_memory_mb = 4096;
213+
214+
let mut n2 = Node::new("node-b".into(), resources(4, 8192, 0));
215+
n2.state = NodeState::Allocated;
216+
n2.alloc_resources = resources(2, 4096, 0);
217+
218+
let snap = NodeMetricsSnapshot::collect([&n1, &n2]);
219+
let body = encode_nodes_metrics_with_format(&snap, MetricsExpositionFormat::Slurm_0_0_4);
220+
221+
assert!(body.contains("# HELP spur_nodes "));
222+
assert!(body.contains("spur_nodes 2\n"));
223+
assert!(body.contains("spur_nodes_idle 1\n"));
224+
assert!(body.contains("spur_nodes_alloc 1\n"));
225+
assert!(body.contains("spur_nodes_cpus 12\n"));
226+
assert!(body.contains("spur_nodes_cpus_alloc 2\n"));
227+
228+
assert!(body.contains("spur_node_cpus{node=\"node-a\"} 8\n"));
229+
assert!(body.contains("spur_node_cpus{node=\"node-b\"} 4\n"));
230+
assert!(body.contains("spur_node_cpus_alloc{node=\"node-b\"} 2\n"));
231+
assert!(body.contains("spur_node_cpu_load{node=\"node-a\"} 12\n"));
232+
assert!(body.contains("spur_node_free_memory_bytes{node=\"node-a\"} 4294967296\n"));
233+
}
35234
}

crates/spur-metrics/src/lib.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,13 @@
55
66
pub mod export;
77
pub mod job;
8+
pub mod node;
89

910
pub use export::jobs::{
1011
encode_job_metrics, encode_job_metrics_with_format, job_state_metric_suffix,
1112
};
12-
pub use export::nodes::encode_nodes_metrics;
13+
pub use export::nodes::{encode_nodes_metrics, encode_nodes_metrics_with_format};
1314
pub use export::partitions::encode_partitions_metrics;
1415
pub use export::scheduler::encode_scheduler_metrics;
16+
pub use node::node_state_metric_suffix;
1517
pub use spur_core::config::MetricsExpositionFormat;

0 commit comments

Comments
 (0)