Skip to content

Commit 20b082a

Browse files
authored
Test/improve testing cpu profiling fallback (#280)
* Track ctimer overhead in CI * Add a claim after buffer full in failed_claims_do_not_advance_tail * Rename disable_permanent into disarm_all_timers * Add a test to verify timer fires samples under cpu load * Add tests to verify sigsegv handler catches faulting loads in unwind * Fix ctimer fallback not triggering on wrapped perf_event_open errors * Clean up * Restore ctimer lifecycle doc * Unmap guard pages before asserts in sigsegv tests * Fix clippy warnings
1 parent 461848e commit 20b082a

12 files changed

Lines changed: 492 additions & 176 deletions

File tree

.github/workflows/benchmarks.yml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,9 @@ jobs:
4343
- name: overhead_bench
4444
adapter: json
4545
command: cargo bench --bench overhead_bench -- --bmf 10
46+
- name: overhead_bench_ctimer
47+
adapter: json
48+
command: DIAL9_FORCE_CTIMER=1 cargo bench --bench overhead_bench -- --bmf 10
4649
- name: e2e_workload
4750
adapter: json
4851
command: cargo bench --bench e2e_workload -- --bmf 10
@@ -90,6 +93,9 @@ jobs:
9093
- name: overhead_bench
9194
adapter: json
9295
command: cargo bench --bench overhead_bench -- --bmf 10
96+
- name: overhead_bench_ctimer
97+
adapter: json
98+
command: DIAL9_FORCE_CTIMER=1 cargo bench --bench overhead_bench -- --bmf 10
9399
- name: e2e_workload
94100
adapter: json
95101
command: cargo bench --bench e2e_workload -- --bmf 10

dial9-tokio-telemetry/README.md

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,7 @@ dial9-tokio-telemetry is designed for always-on production use, but it's still e
101101

102102
Yes, check out this [quick walkthrough (YouTube)](https://www.youtube.com/watch?v=zJOzU_6Mf7Q)!
103103

104-
The [viewer](https://dial9-tokio-telemetry.netlify.app/) (autodeployed from code in `main`) is hosted on Netlify for convenience. You can [load the demo trace](https://dial9-tokio-telemetry.netlify.app/?trace=demo-trace.bin) directly, or use [serve.py](/dial9-tokio-telemetry/serve.py) to run it locally (pure HTML and JS, client side only).
104+
The [viewer](https://dial9-tokio-telemetry.netlify.app/) (autodeployed from code in `main`) is hosted on Netlify for convenience. You can [load the demo trace](https://dial9-tokio-telemetry.netlify.app/?trace=demo-trace.bin) directly, or use [serve.py](https://github.com/dial9-rs/dial9-tokio-telemetry/blob/main/dial9-tokio-telemetry/serve.py) to run it locally (pure HTML and JS, client side only).
105105

106106
<img width="1288" height="659" alt="Screenshot 2026-03-01 at 3 52 59 PM" src="https://github.com/user-attachments/assets/77225801-70b1-4aef-b064-32bc2326b1ef" />
107107

@@ -185,7 +185,7 @@ runtime.block_on(async {
185185
# }
186186
```
187187

188-
For frameworks like Axum where you don't control the spawn call, you need to wrap the accept loop. See [`examples/metrics-service/src/axum_traced.rs`](/examples/metrics-service/src/axum_traced.rs) for a working example that wraps both the accept loop and per-connection futures.
188+
For frameworks like Axum where you don't control the spawn call, you need to wrap the accept loop. See [`examples/metrics-service/src/axum_traced.rs`](https://github.com/dial9-rs/dial9-tokio-telemetry/blob/main/examples/metrics-service/src/axum_traced.rs) for a working example that wraps both the accept loop and per-connection futures.
189189

190190
## Custom events
191191

@@ -219,7 +219,7 @@ record_event(
219219
# }
220220
```
221221

222-
For events with repeated string values (HTTP methods, endpoint paths, etc.), implement `Encodable` manually to use string interning — see [`examples/custom_events.rs`](/dial9-tokio-telemetry/examples/custom_events.rs) for a complete example showing both patterns.
222+
For events with repeated string values (HTTP methods, endpoint paths, etc.), implement `Encodable` manually to use string interning — see [`examples/custom_events.rs`](https://github.com/dial9-rs/dial9-tokio-telemetry/blob/main/dial9-tokio-telemetry/examples/custom_events.rs) for a complete example showing both patterns.
223223

224224
Custom events are encoded into the same thread-local buffer as built-in events (~100–200 ns per call) and appear in the trace viewer alongside poll/park/wake events.
225225

@@ -263,7 +263,7 @@ let (runtime, guard) = TracedRuntime::builder()
263263
# fn main() {}
264264
```
265265

266-
This pulls in [`dial9-perf-self-profile`](/perf-self-profile) for `perf_event_open` access. It records `CpuSample` events with raw stack frame addresses. When a `trace_path` is set, the background worker automatically symbolizes sealed segments (resolving addresses to function names via `/proc/self/maps` and blazesym) and gzip-compresses them on disk.
266+
This pulls in [`dial9-perf-self-profile`](https://github.com/dial9-rs/dial9-tokio-telemetry/tree/main/perf-self-profile) for `perf_event_open` access. It records `CpuSample` events with raw stack frame addresses. When a `trace_path` is set, the background worker automatically symbolizes sealed segments (resolving addresses to function names via `/proc/self/maps` and blazesym) and gzip-compresses them on disk.
267267

268268
#### Requirements
269269

@@ -351,7 +351,7 @@ let (io_rt, io_handle) = guard.trace_runtime("io").build(io_builder)?;
351351
# }
352352
```
353353

354-
See [`examples/thread_per_core.rs`](/dial9-tokio-telemetry/examples/thread_per_core.rs) and [`examples/multi_runtime.rs`](/dial9-tokio-telemetry/examples/multi_runtime.rs) for complete examples.
354+
See [`examples/thread_per_core.rs`](https://github.com/dial9-rs/dial9-tokio-telemetry/blob/main/dial9-tokio-telemetry/examples/thread_per_core.rs) and [`examples/multi_runtime.rs`](https://github.com/dial9-rs/dial9-tokio-telemetry/blob/main/dial9-tokio-telemetry/examples/multi_runtime.rs) for complete examples.
355355

356356
**Shutdown**: Drop all runtimes before the `TelemetryGuard` so worker threads exit and flush their thread-local buffers. For a clean shutdown that waits for the background worker (e.g. S3 uploads) to drain, call `guard.graceful_shutdown(timeout)` instead of dropping the guard.
357357

@@ -361,7 +361,7 @@ See [`examples/thread_per_core.rs`](/dial9-tokio-telemetry/examples/thread_per_c
361361

362362
### Analyzing traces
363363

364-
[`dial9-viewer`](/dial9-viewer) is an interactive trace viewer and S3 browser. Point it at a local directory or an S3 bucket to browse and visualize traces in the browser. [Here's a demo.](https://www.youtube.com/watch?v=zJOzU_6Mf7Q)
364+
[`dial9-viewer`](https://github.com/dial9-rs/dial9-tokio-telemetry/tree/main/dial9-viewer) is an interactive trace viewer and S3 browser. Point it at a local directory or an S3 bucket to browse and visualize traces in the browser. [Here's a demo.](https://www.youtube.com/watch?v=zJOzU_6Mf7Q)
365365

366366
```bash
367367
# Install
@@ -388,7 +388,7 @@ cargo run --example analyze_trace --features analysis -- /tmp/my_traces/trace.0.
388388
cargo run --example trace_to_jsonl --features analysis -- /tmp/my_traces/trace.0.bin.gz output.jsonl
389389
```
390390

391-
See [TRACE_ANALYSIS_GUIDE.md](/dial9-tokio-telemetry/TRACE_ANALYSIS_GUIDE.md) for a walkthrough of diagnosing scheduling delays and CPU hotspots from trace data.
391+
See [TRACE_ANALYSIS_GUIDE.md](https://github.com/dial9-rs/dial9-tokio-telemetry/blob/main/dial9-tokio-telemetry/TRACE_ANALYSIS_GUIDE.md) for a walkthrough of diagnosing scheduling delays and CPU hotspots from trace data.
392392

393393
## Features
394394

@@ -456,7 +456,7 @@ cargo run --example telemetry_rotating # manual setup + rotating writer conf
456456
cargo run --example multi_runtime # multiple runtimes, manual TelemetryCore
457457
```
458458

459-
The [`examples/metrics-service`](/examples/metrics-service) directory has a full Axum service with DynamoDB persistence, a load-generating client, and telemetry wired up end-to-end.
459+
The [`examples/metrics-service`](https://github.com/dial9-rs/dial9-tokio-telemetry/tree/main/examples/metrics-service) directory has a full Axum service with DynamoDB persistence, a load-generating client, and telemetry wired up end-to-end.
460460

461461
## Overhead
462462

@@ -476,11 +476,11 @@ Overhead: 3.2%
476476

477477
This repo is a Cargo workspace with five members:
478478

479-
- [`dial9-tokio-telemetry`](/dial9-tokio-telemetry) — the main crate
480-
- [`dial9-viewer`](/dial9-viewer) — CLI and web UI for browsing traces in S3 or on the local filesystem
481-
- [`dial9-macro`](/dial9-macro) — the `#[dial9_tokio_telemetry::main]` attribute macro
482-
- [`dial9-perf-self-profile`](/perf-self-profile) — minimal Linux `perf_event_open` wrapper for CPU profiling and scheduler events
483-
- [`examples/metrics-service`](/examples/metrics-service) — end-to-end example service
479+
- [`dial9-tokio-telemetry`](https://github.com/dial9-rs/dial9-tokio-telemetry/tree/main/dial9-tokio-telemetry) — the main crate
480+
- [`dial9-viewer`](https://github.com/dial9-rs/dial9-tokio-telemetry/tree/main/dial9-viewer) — CLI and web UI for browsing traces in S3 or on the local filesystem
481+
- [`dial9-macro`](https://github.com/dial9-rs/dial9-tokio-telemetry/tree/main/dial9-macro) — the `#[dial9_tokio_telemetry::main]` attribute macro
482+
- [`dial9-perf-self-profile`](https://github.com/dial9-rs/dial9-tokio-telemetry/tree/main/perf-self-profile) — minimal Linux `perf_event_open` wrapper for CPU profiling and scheduler events
483+
- [`examples/metrics-service`](https://github.com/dial9-rs/dial9-tokio-telemetry/tree/main/examples/metrics-service) — end-to-end example service
484484

485485
## Future work
486486

dial9-tokio-telemetry/benches/overhead_bench.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,7 @@ fn run_bench(mode: &str, duration_secs: u64) -> BenchResult {
110110
let (server_rt, guard): (tokio::runtime::Runtime, Option<TelemetryGuard>) = match mode {
111111
"telemetry" => {
112112
let writer = RotatingWriter::single_file("/tmp/overhead_bench_trace.bin").unwrap();
113+
#[allow(unused_mut)]
113114
let mut tb = TracedRuntime::builder().with_task_tracking(true);
114115
#[cfg(target_os = "linux")]
115116
{
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
//! Generate a trace with 48 workers for testing the viewer with many lanes.
2+
//!
3+
//! Usage:
4+
//! cargo run --example many_workers
5+
//!
6+
//! Then open the trace in the viewer:
7+
//! cargo run -p dial9-viewer -- serve --local-dir .
8+
9+
use std::time::Duration;
10+
11+
use dial9_tokio_telemetry::config::{Dial9Config, Dial9ConfigBuilder};
12+
use dial9_tokio_telemetry::telemetry::TelemetryHandle;
13+
14+
fn my_config() -> Dial9Config {
15+
Dial9ConfigBuilder::new(
16+
"many_workers_trace.bin",
17+
64 * 1024 * 1024,
18+
256 * 1024 * 1024,
19+
)
20+
.with_tokio(|t| {
21+
t.worker_threads(48);
22+
})
23+
.with_runtime(|r| r.with_task_tracking(true))
24+
.build()
25+
}
26+
27+
#[dial9_tokio_telemetry::main(config = my_config)]
28+
async fn main() {
29+
println!("Running workload with 48 workers...");
30+
31+
let handle = TelemetryHandle::current();
32+
let tasks: Vec<_> = (0..500)
33+
.map(|i| {
34+
handle.spawn(async move {
35+
for _ in 0..5 {
36+
tokio::time::sleep(Duration::from_millis(5)).await;
37+
// Small CPU work to generate poll events
38+
let mut v = 0u64;
39+
for j in 0..50_000u64 {
40+
v = v.wrapping_add(j.wrapping_mul(j));
41+
}
42+
std::hint::black_box(v);
43+
tokio::task::yield_now().await;
44+
}
45+
if i % 100 == 0 {
46+
println!("Task {i} done");
47+
}
48+
})
49+
})
50+
.collect();
51+
52+
for task in tasks {
53+
let _ = task.await;
54+
}
55+
56+
println!("Trace written to many_workers_trace.*.bin");
57+
}

dial9-viewer/src/storage.rs

Lines changed: 2 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -220,7 +220,7 @@ impl StorageBackend for LocalBackend {
220220
let prefix2 = prefix.clone();
221221
tokio::task::spawn_blocking(move || {
222222
let mut objects = Vec::new();
223-
collect_files(&root, &root, &prefix2, &mut objects, 0, &mut 0)?;
223+
collect_files(&root, &root, &prefix2, &mut objects)?;
224224
objects.sort_by(|a, b| a.key.cmp(&b.key));
225225
Ok(objects)
226226
})
@@ -313,49 +313,18 @@ impl StorageBackend for LocalBackend {
313313
}
314314
}
315315

316-
/// Maximum directory depth to recurse into when listing local files.
317-
const MAX_COLLECT_DEPTH: u32 = 10;
318-
319-
/// Maximum number of files to return from a local directory listing.
320-
const MAX_COLLECT_FILES: usize = 50;
321-
322-
/// Maximum number of directory entries to visit (files + dirs) across the
323-
/// entire recursive walk. This bounds the number of syscalls (`canonicalize`,
324-
/// `metadata`) so a huge directory tree cannot hang the listing.
325-
const MAX_ENTRIES_VISITED: usize = 500;
326-
327-
/// Directory names to skip during recursive file collection.
328-
fn is_skipped_dir(name: &str) -> bool {
329-
name.starts_with('.') || matches!(name, "target" | "node_modules")
330-
}
331-
332316
fn collect_files(
333317
root: &Path,
334318
dir: &Path,
335319
prefix: &str,
336320
out: &mut Vec<ObjectInfo>,
337-
depth: u32,
338-
visited: &mut usize,
339321
) -> Result<(), StorageError> {
340-
if depth > MAX_COLLECT_DEPTH
341-
|| out.len() >= MAX_COLLECT_FILES
342-
|| *visited >= MAX_ENTRIES_VISITED
343-
{
344-
return Ok(());
345-
}
346322
let entries = match std::fs::read_dir(dir) {
347323
Ok(e) => e,
348324
Err(e) if e.kind() == std::io::ErrorKind::NotFound => return Ok(()),
349-
Err(e) if e.kind() == std::io::ErrorKind::PermissionDenied => {
350-
return Err(StorageError::Other("permission denied".into()));
351-
}
352325
Err(e) => return Err(StorageError::Other(e.to_string())),
353326
};
354327
for entry in entries {
355-
*visited += 1;
356-
if out.len() >= MAX_COLLECT_FILES || *visited >= MAX_ENTRIES_VISITED {
357-
break;
358-
}
359328
let entry = entry.map_err(|e| StorageError::Other(e.to_string()))?;
360329
let path = entry.path();
361330
// Resolve symlinks and verify the target stays within root.
@@ -364,11 +333,7 @@ fn collect_files(
364333
_ => continue,
365334
};
366335
if canonical.is_dir() {
367-
let name = entry.file_name();
368-
let name = name.to_string_lossy();
369-
if !is_skipped_dir(&name) {
370-
collect_files(root, &canonical, prefix, out, depth + 1, visited)?;
371-
}
336+
collect_files(root, &canonical, prefix, out)?;
372337
} else if canonical.is_file() {
373338
let key = path
374339
.strip_prefix(root)
@@ -392,31 +357,3 @@ fn collect_files(
392357
}
393358
Ok(())
394359
}
395-
396-
#[cfg(test)]
397-
mod tests {
398-
use super::*;
399-
400-
#[test]
401-
fn collect_files_caps_entries_visited() {
402-
let dir = tempfile::tempdir().unwrap();
403-
// Create more files than MAX_ENTRIES_VISITED to prove we stop early.
404-
let n = MAX_ENTRIES_VISITED + 500;
405-
for i in 0..n {
406-
std::fs::write(dir.path().join(format!("file_{i:05}.bin")), b"x").unwrap();
407-
}
408-
let mut out = Vec::new();
409-
let mut visited = 0;
410-
collect_files(dir.path(), dir.path(), "", &mut out, 0, &mut visited).unwrap();
411-
// visited must be capped — we should NOT have iterated all n files.
412-
assert!(
413-
visited <= MAX_ENTRIES_VISITED,
414-
"visited {visited} entries, expected at most {MAX_ENTRIES_VISITED}"
415-
);
416-
assert!(
417-
out.len() <= MAX_COLLECT_FILES,
418-
"collected {} files, expected at most {MAX_COLLECT_FILES}",
419-
out.len()
420-
);
421-
}
422-
}

0 commit comments

Comments
 (0)