Skip to content

Commit 7344d5d

Browse files
feat: add counter for critical logs (#163)
1 parent 8f90e25 commit 7344d5d

6 files changed

Lines changed: 108 additions & 0 deletions

File tree

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

finalizer/src/actor.rs

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -269,6 +269,9 @@ impl<
269269
self.context.sleep(std::time::Duration::from_secs(5)).await;
270270
} else {
271271
error!(target: "critical", "finalizer started with invalid forkchoice: {forkchoice:?}, height: {}, epoch: {}", self.canonical_state.get_latest_height(), self.canonical_state.get_epoch());
272+
#[cfg(feature = "prom")]
273+
counter!("critical_errors_total", "reason" => "invalid_forkchoice", "severity" => "critical")
274+
.increment(1);
272275
panic!(
273276
"finalizer started with invalid forkchoice: {forkchoice:?}, height: {}, epoch: {}",
274277
self.canonical_state.get_latest_height(),
@@ -361,6 +364,8 @@ impl<
361364
// orchestrator, the finalizer should never receive a GetEpochGenesisHash request for the wrong epoch.
362365
if epoch != self.canonical_state.get_epoch() {
363366
error!(target: "critical", "Finalizer received epoch genesis hash request from a different epoch. This should not happen and is a bug. Our epoch: {}, requested epoch {}", self.canonical_state.get_epoch(), epoch);
367+
#[cfg(feature = "prom")]
368+
counter!("critical_errors_total", "reason" => "epoch_mismatch", "severity" => "critical").increment(1);
364369
}
365370
let _ = response.send(self.canonical_state.get_epoch_genesis_hash());
366371
},
@@ -1381,6 +1386,8 @@ async fn execute_block<
13811386
?eth_block_hash,
13821387
"block validation failed, not executing but keeping in chain"
13831388
);
1389+
#[cfg(feature = "prom")]
1390+
counter!("critical_errors_total", "reason" => "block_validation_failed", "severity" => "critical").increment(1);
13841391
}
13851392

13861393
state.set_latest_height(new_height);
@@ -1484,6 +1491,8 @@ async fn parse_execution_requests<
14841491
// The deposit contract verifies that the withdrawal credentials
14851492
// follow the expected format, so this should never happen.
14861493
error!(target: "critical", reason = "failed to parse withdrawal credentials (this is not a Summit error)", ?deposit_request);
1494+
#[cfg(feature = "prom")]
1495+
counter!("critical_errors_total", "reason" => "invalid_withdrawal_credentials", "severity" => "critical").increment(1);
14871496
warn!(
14881497
"Failed to parse withdrawal credentials: {e}"
14891498
);
@@ -1516,6 +1525,8 @@ async fn parse_execution_requests<
15161525
// The deposit contract verifies that the withdrawal credentials
15171526
// follow the expected format, so this should never happen.
15181527
error!(target: "critical", reason = "failed to parse withdrawal credentials (this is not a Summit error)", ?deposit_request);
1528+
#[cfg(feature = "prom")]
1529+
counter!("critical_errors_total", "reason" => "invalid_withdrawal_credentials", "severity" => "critical").increment(1);
15191530
warn!("Failed to parse withdrawal credentials: {e}");
15201531
continue;
15211532
}

finalizer/src/db.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,8 @@ impl<E: Clock + Storage + Metrics, V: Variant> FinalizerState<E, V> {
5050
/// Log a database error and initiate graceful shutdown.
5151
fn handle_db_error(&self, e: impl std::fmt::Display, op: &str) {
5252
error!(target: "critical", %e, op, "fatal database error, initiating shutdown");
53+
#[cfg(feature = "prom")]
54+
metrics::counter!("critical_errors_total", "reason" => "fatal_db_error", "severity" => "critical").increment(1);
5355
self.cancellation_token.cancel();
5456
}
5557

node/Cargo.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,7 @@ tikv-jemalloc-ctl = { version = "0.6", optional = true, features = ["stats"] }
118118

119119
[target.'cfg(target_os = "linux")'.dependencies]
120120
procfs = { version = "0.17.0", optional = true }
121+
libc = { version = "0.2", optional = true }
121122

122123
[dev-dependencies]
123124
commonware-macros.workspace = true
@@ -146,6 +147,7 @@ prom = [
146147
"http",
147148
"eyre",
148149
"procfs",
150+
"libc",
149151
"summit-types/prom",
150152
]
151153
tokio-console = ["console-subscriber"]

node/src/prom/hooks.rs

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ impl Default for HooksBuilder {
4848
Box::new(|| Collector::default().collect()),
4949
Box::new(collect_memory_stats),
5050
Box::new(collect_io_stats),
51+
Box::new(collect_disk_stats),
5152
],
5253
}
5354
}
@@ -175,3 +176,56 @@ fn collect_io_stats() {
175176

176177
#[cfg(not(target_os = "linux"))]
177178
const fn collect_io_stats() {}
179+
180+
#[cfg(target_os = "linux")]
181+
fn collect_disk_stats() {
182+
use metrics::gauge;
183+
use std::ffi::CString;
184+
use std::fs;
185+
use std::mem::MaybeUninit;
186+
use tracing::error;
187+
188+
let Ok(contents) = fs::read_to_string("/proc/mounts")
189+
.map_err(|error| error!(%error, "Failed to read /proc/mounts"))
190+
else {
191+
return;
192+
};
193+
194+
for line in contents.lines() {
195+
let fields: Vec<&str> = line.split_whitespace().collect();
196+
if fields.len() < 3 {
197+
continue;
198+
}
199+
let device = fields[0];
200+
if !device.starts_with('/') {
201+
continue;
202+
}
203+
let mount_point = fields[1];
204+
205+
let Some(c_path) = CString::new(mount_point).ok() else {
206+
continue;
207+
};
208+
209+
let mut buf = MaybeUninit::<libc::statvfs>::uninit();
210+
let ret = unsafe { libc::statvfs(c_path.as_ptr(), buf.as_mut_ptr()) };
211+
if ret != 0 {
212+
continue;
213+
}
214+
let stat = unsafe { buf.assume_init() };
215+
216+
let block_size = stat.f_frsize;
217+
let total = stat.f_blocks * block_size;
218+
let free = stat.f_bfree * block_size;
219+
let available = stat.f_bavail * block_size;
220+
let used = total.saturating_sub(free);
221+
222+
gauge!("disk.total_bytes", "mountpoint" => mount_point.to_string()).set(total as f64);
223+
gauge!("disk.free_bytes", "mountpoint" => mount_point.to_string()).set(free as f64);
224+
gauge!("disk.available_bytes", "mountpoint" => mount_point.to_string())
225+
.set(available as f64);
226+
gauge!("disk.used_bytes", "mountpoint" => mount_point.to_string()).set(used as f64);
227+
}
228+
}
229+
230+
#[cfg(not(target_os = "linux"))]
231+
const fn collect_disk_stats() {}

node/src/prom/server.rs

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,9 +64,11 @@ impl MetricServer {
6464
describe_db_metrics();
6565
describe_static_file_metrics();
6666
describe_ssz_metrics();
67+
describe_critical_error_metrics();
6768
Collector::default().describe();
6869
describe_memory_stats();
6970
describe_io_stats();
71+
describe_disk_stats();
7072

7173
Ok(())
7274
}
@@ -204,6 +206,15 @@ fn describe_ssz_metrics() {
204206
);
205207
}
206208

209+
fn describe_critical_error_metrics() {
210+
use metrics::describe_counter;
211+
212+
describe_counter!(
213+
"critical_errors_total",
214+
"Critical errors requiring immediate attention (labelled by reason)"
215+
);
216+
}
217+
207218
#[cfg(all(feature = "jemalloc", unix))]
208219
fn describe_memory_stats() {
209220
describe_gauge!(
@@ -262,6 +273,33 @@ fn describe_io_stats() {
262273
#[cfg(not(target_os = "linux"))]
263274
const fn describe_io_stats() {}
264275

276+
#[cfg(target_os = "linux")]
277+
fn describe_disk_stats() {
278+
describe_gauge!(
279+
"disk.total_bytes",
280+
Unit::Bytes,
281+
"Total size of the filesystem"
282+
);
283+
describe_gauge!(
284+
"disk.free_bytes",
285+
Unit::Bytes,
286+
"Free bytes on the filesystem (including reserved)"
287+
);
288+
describe_gauge!(
289+
"disk.available_bytes",
290+
Unit::Bytes,
291+
"Bytes available to non-root users"
292+
);
293+
describe_gauge!(
294+
"disk.used_bytes",
295+
Unit::Bytes,
296+
"Used bytes on the filesystem"
297+
);
298+
}
299+
300+
#[cfg(not(target_os = "linux"))]
301+
const fn describe_disk_stats() {}
302+
265303
#[cfg(test)]
266304
mod tests {
267305
use super::*;

0 commit comments

Comments
 (0)