|
3 | 3 |
|
4 | 4 | //! Node gauge registration for `/metrics/nodes` (Layer 1b). |
5 | 5 |
|
| 6 | +use prometheus_client::encoding::EncodeLabelSet; |
| 7 | +use prometheus_client::metrics::family::Family; |
| 8 | +use prometheus_client::metrics::gauge::Gauge; |
6 | 9 | use prometheus_client::registry::Registry; |
| 10 | +use spur_core::node::NodeState; |
7 | 11 | use spur_core::config::MetricsExpositionFormat; |
| 12 | +use std::sync::atomic::AtomicU64; |
8 | 13 |
|
9 | 14 | use crate::export::encode_registered; |
| 15 | +use crate::node::{node_state_metric_suffix, NodeMetricsSnapshot}; |
10 | 16 |
|
11 | | -/// Register node catalog gauges (stub until `NodeMetricsSnapshot` exists). |
12 | | -pub fn register_nodes(_registry: &mut Registry) {} |
| 17 | +#[derive(Clone, Debug, Hash, PartialEq, Eq, EncodeLabelSet)] |
| 18 | +struct NodeLabel { |
| 19 | + node: String, |
| 20 | +} |
| 21 | + |
| 22 | +fn register_gauge(registry: &mut Registry, name: &str, help: &str, value: u64) { |
| 23 | + let gauge = Gauge::<u64, AtomicU64>::default(); |
| 24 | + gauge.set(value); |
| 25 | + registry.register(name, help, gauge); |
| 26 | +} |
| 27 | + |
| 28 | +fn set_family_gauge(family: &Family<NodeLabel, Gauge<u64, AtomicU64>>, node: &str, value: u64) { |
| 29 | + let gauge = family.get_or_create(&NodeLabel { |
| 30 | + node: node.to_string(), |
| 31 | + }); |
| 32 | + gauge.set(value); |
| 33 | +} |
| 34 | + |
| 35 | +/// Register node catalog gauges into `registry` from `snap`. |
| 36 | +pub fn register_nodes(registry: &mut Registry, snap: &NodeMetricsSnapshot) { |
| 37 | + register_gauge(registry, "spur_nodes", "Total number of nodes", snap.total); |
| 38 | + |
| 39 | + for &state in &NodeState::ALL { |
| 40 | + let suffix = node_state_metric_suffix(state); |
| 41 | + let name = format!("spur_nodes_{suffix}"); |
| 42 | + let help = format!("Number of nodes in {} state", state.display()); |
| 43 | + register_gauge(registry, &name, &help, snap.count_state(state)); |
| 44 | + } |
| 45 | + |
| 46 | + register_gauge( |
| 47 | + registry, |
| 48 | + "spur_nodes_cpus", |
| 49 | + "Total CPUs across all nodes", |
| 50 | + snap.total_cpus, |
| 51 | + ); |
| 52 | + register_gauge( |
| 53 | + registry, |
| 54 | + "spur_nodes_cpus_alloc", |
| 55 | + "Total CPUs allocated across all nodes", |
| 56 | + snap.alloc_cpus, |
| 57 | + ); |
| 58 | + register_gauge( |
| 59 | + registry, |
| 60 | + "spur_nodes_memory_bytes", |
| 61 | + "Total memory in bytes across all nodes", |
| 62 | + snap.total_memory_bytes, |
| 63 | + ); |
| 64 | + register_gauge( |
| 65 | + registry, |
| 66 | + "spur_nodes_memory_alloc_bytes", |
| 67 | + "Total memory in bytes allocated across all nodes", |
| 68 | + snap.alloc_memory_bytes, |
| 69 | + ); |
| 70 | + register_gauge( |
| 71 | + registry, |
| 72 | + "spur_nodes_gpus", |
| 73 | + "Total GPUs across all nodes", |
| 74 | + snap.total_gpus, |
| 75 | + ); |
| 76 | + register_gauge( |
| 77 | + registry, |
| 78 | + "spur_nodes_gpus_alloc", |
| 79 | + "Total GPUs allocated across all nodes", |
| 80 | + snap.alloc_gpus, |
| 81 | + ); |
| 82 | + |
| 83 | + let node_cpus = Family::<NodeLabel, Gauge<u64, AtomicU64>>::default(); |
| 84 | + let node_cpus_alloc = Family::<NodeLabel, Gauge<u64, AtomicU64>>::default(); |
| 85 | + let node_memory_bytes = Family::<NodeLabel, Gauge<u64, AtomicU64>>::default(); |
| 86 | + let node_memory_alloc_bytes = Family::<NodeLabel, Gauge<u64, AtomicU64>>::default(); |
| 87 | + let node_gpus = Family::<NodeLabel, Gauge<u64, AtomicU64>>::default(); |
| 88 | + let node_gpus_alloc = Family::<NodeLabel, Gauge<u64, AtomicU64>>::default(); |
| 89 | + let node_cpu_load = Family::<NodeLabel, Gauge<u64, AtomicU64>>::default(); |
| 90 | + let node_free_memory_bytes = Family::<NodeLabel, Gauge<u64, AtomicU64>>::default(); |
| 91 | + |
| 92 | + for node in &snap.per_node { |
| 93 | + set_family_gauge(&node_cpus, &node.name, node.total_cpus); |
| 94 | + set_family_gauge(&node_cpus_alloc, &node.name, node.alloc_cpus); |
| 95 | + set_family_gauge(&node_memory_bytes, &node.name, node.total_memory_bytes); |
| 96 | + set_family_gauge( |
| 97 | + &node_memory_alloc_bytes, |
| 98 | + &node.name, |
| 99 | + node.alloc_memory_bytes, |
| 100 | + ); |
| 101 | + set_family_gauge(&node_gpus, &node.name, node.total_gpus); |
| 102 | + set_family_gauge(&node_gpus_alloc, &node.name, node.alloc_gpus); |
| 103 | + set_family_gauge(&node_cpu_load, &node.name, node.cpu_load); |
| 104 | + set_family_gauge( |
| 105 | + &node_free_memory_bytes, |
| 106 | + &node.name, |
| 107 | + node.free_memory_bytes, |
| 108 | + ); |
| 109 | + } |
| 110 | + |
| 111 | + registry.register( |
| 112 | + "spur_node_cpus", |
| 113 | + "CPUs on the specified node", |
| 114 | + node_cpus, |
| 115 | + ); |
| 116 | + registry.register( |
| 117 | + "spur_node_cpus_alloc", |
| 118 | + "CPUs allocated on the specified node", |
| 119 | + node_cpus_alloc, |
| 120 | + ); |
| 121 | + registry.register( |
| 122 | + "spur_node_memory_bytes", |
| 123 | + "Memory in bytes on the specified node", |
| 124 | + node_memory_bytes, |
| 125 | + ); |
| 126 | + registry.register( |
| 127 | + "spur_node_memory_alloc_bytes", |
| 128 | + "Memory in bytes allocated on the specified node", |
| 129 | + node_memory_alloc_bytes, |
| 130 | + ); |
| 131 | + registry.register("spur_node_gpus", "GPUs on the specified node", node_gpus); |
| 132 | + registry.register( |
| 133 | + "spur_node_gpus_alloc", |
| 134 | + "GPUs allocated on the specified node", |
| 135 | + node_gpus_alloc, |
| 136 | + ); |
| 137 | + registry.register( |
| 138 | + "spur_node_cpu_load", |
| 139 | + "CPU load reported by the node agent", |
| 140 | + node_cpu_load, |
| 141 | + ); |
| 142 | + registry.register( |
| 143 | + "spur_node_free_memory_bytes", |
| 144 | + "Free memory in bytes reported by the node agent", |
| 145 | + node_free_memory_bytes, |
| 146 | + ); |
| 147 | +} |
13 | 148 |
|
14 | 149 | /// Encode node metrics for `/metrics/nodes`. |
15 | | -pub fn encode_nodes_metrics(format: MetricsExpositionFormat) -> String { |
16 | | - encode_registered(register_nodes, format) |
| 150 | +pub fn encode_nodes_metrics_with_format( |
| 151 | + snap: &NodeMetricsSnapshot, |
| 152 | + format: MetricsExpositionFormat, |
| 153 | +) -> String { |
| 154 | + encode_registered(|registry| register_nodes(registry, snap), format) |
| 155 | +} |
| 156 | + |
| 157 | +/// Encode node metrics for `/metrics/nodes` (default: Slurm 0.0.4 text). |
| 158 | +pub fn encode_nodes_metrics(snap: &NodeMetricsSnapshot) -> String { |
| 159 | + encode_nodes_metrics_with_format(snap, MetricsExpositionFormat::default()) |
17 | 160 | } |
18 | 161 |
|
#[cfg(test)]
mod tests {
    use super::*;
    use crate::node::NodeMetricsSnapshot;
    use spur_core::node::Node;
    use spur_core::resource::{GpuLinkType, GpuResource, ResourceSet};

    /// Build a `ResourceSet` with the given CPU and memory figures and
    /// `gpu_count` identical GPUs whose device ids run from 0 upward.
    fn resources(cpus: u32, memory_mb: u64, gpu_count: u32) -> ResourceSet {
        let gpus = (0..gpu_count)
            .map(|device_id| GpuResource {
                device_id,
                gpu_type: "mi300x".into(),
                memory_mb: 0,
                peer_gpus: vec![],
                link_type: GpuLinkType::XGMI,
            })
            .collect();
        ResourceSet {
            cpus,
            memory_mb,
            gpus,
            generic: Default::default(),
        }
    }

    // NOTE(review): `register_nodes` registers `spur_nodes*` gauges even for a
    // default snapshot, so the two empty-snapshot expectations below may no
    // longer hold — confirm against `encode_registered`'s behavior.
    #[test]
    fn empty_nodes_export_slurm_has_no_samples() {
        let snap = NodeMetricsSnapshot::default();
        let body = encode_nodes_metrics_with_format(&snap, MetricsExpositionFormat::Slurm_0_0_4);
        assert!(!body.contains("spur_"));
        assert!(!body.contains("# EOF"));
    }

    #[test]
    fn empty_nodes_export_openmetrics_has_eof_only() {
        let snap = NodeMetricsSnapshot::default();
        let body =
            encode_nodes_metrics_with_format(&snap, MetricsExpositionFormat::OpenMetrics_1_0);
        assert_eq!(body, "# EOF\n");
    }

    /// Two nodes (one idle, one allocated) must surface both the cluster-wide
    /// gauges and the per-node labeled samples.
    #[test]
    fn export_contains_core_gauges_and_per_node_families() {
        let mut n1 = Node::new("node-a".into(), resources(8, 16384, 2));
        n1.state = NodeState::Idle;
        n1.cpu_load = 12;
        n1.free_memory_mb = 4096;

        let mut n2 = Node::new("node-b".into(), resources(4, 8192, 0));
        n2.state = NodeState::Allocated;
        n2.alloc_resources = resources(2, 4096, 0);

        let snap = NodeMetricsSnapshot::collect([&n1, &n2]);
        let body = encode_nodes_metrics_with_format(&snap, MetricsExpositionFormat::Slurm_0_0_4);

        // Cluster-wide gauges.
        assert!(body.contains("# HELP spur_nodes "));
        assert!(body.contains("spur_nodes 2\n"));
        assert!(body.contains("spur_nodes_idle 1\n"));
        assert!(body.contains("spur_nodes_alloc 1\n"));
        assert!(body.contains("spur_nodes_cpus 12\n"));
        assert!(body.contains("spur_nodes_cpus_alloc 2\n"));

        // Per-node families.
        assert!(body.contains("spur_node_cpus{node=\"node-a\"} 8\n"));
        assert!(body.contains("spur_node_cpus{node=\"node-b\"} 4\n"));
        assert!(body.contains("spur_node_cpus_alloc{node=\"node-b\"} 2\n"));
        assert!(body.contains("spur_node_cpu_load{node=\"node-a\"} 12\n"));
        assert!(body.contains("spur_node_free_memory_bytes{node=\"node-a\"} 4294967296\n"));
    }
}
0 commit comments