Skip to content

Commit ad92d5a

Browse files
committed
fix: eliminate task dump busy loop and move dl_iterate_phdr off hot path
Two fixes for task dump overhead: 1. Add `just_captured` flag to break the capture→wake→poll→capture busy loop. The `trace_with` re-poll registers the real waker which fires immediately, causing infinite re-scheduling. Now we skip capture on the spurious wake, limiting overhead to one extra poll per capture. 2. Split stack walking into two phases: - Capture (hot path): collect raw IPs via `_Unwind_GetIP` only — no `_Unwind_FindEnclosingFunction`, no `dl_iterate_phdr`, no locks. - Emit (rare, Poisson-sampled): trim frames to root/leaf boundaries using `_Unwind_FindEnclosingFunction`. This moves the global `dl_iterate_phdr` mutex off the per-yield capture path onto the infrequent emit path. Also adds `--no-task-dumps` flag to metrics-service for A/B comparison. Before: 14K scheduling delays >1ms, worker pegged at 99.7%, pthread_mutex_lock 14.3% of CPU samples, POST p99 5.5ms After: 373 scheduling delays >1ms, workers at 16%, mutex contention gone from hotspots, POST p99 2.5ms Baseline (no dumps): 262 delays >1ms, 12% util, POST p99 1.85ms
1 parent 4c0025a commit ad92d5a

4 files changed

Lines changed: 125 additions & 67 deletions

File tree

dial9-tokio-telemetry/src/task_dumped.rs

Lines changed: 59 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,12 @@ pin_project! {
100100
sample_mean_ns: u64,
101101
// Per-task PRNG for drawing exponential gaps.
102102
rng: SplitMix64,
103+
// Set after `capture()` re-polls the inner future with the real waker.
104+
// The re-poll causes a spurious immediate wake; this flag suppresses
105+
// the next capture to break the busy loop (capture → wake → poll →
106+
// capture → …). Cleared on the next poll so subsequent real wakes
107+
// proceed normally.
108+
just_captured: bool,
103109
}
104110
}
105111

@@ -126,6 +132,7 @@ impl<F> TaskDumped<F> {
126132
next_sample_ns,
127133
sample_mean_ns,
128134
rng,
135+
just_captured: false,
129136
}
130137
}
131138
}
@@ -172,9 +179,17 @@ impl<F: Future> Future for TaskDumped<F> {
172179
*this.pending_capture_ts = None;
173180
}
174181
Poll::Pending => {
175-
this.frames.capture(this.inner.as_mut(), cx);
176-
let poll_end = crate::telemetry::recorder::poll_start_ts_or_now();
177-
*this.pending_capture_ts = NonZeroU64::new(poll_end);
182+
// Skip capture if this poll was triggered by the spurious wake
183+
// from the previous capture's re-poll. This breaks the busy
184+
// loop: capture → wake → poll → capture → …
185+
if *this.just_captured {
186+
*this.just_captured = false;
187+
} else {
188+
this.frames.capture(this.inner.as_mut(), cx);
189+
*this.just_captured = true;
190+
let poll_end = crate::telemetry::recorder::poll_start_ts_or_now();
191+
*this.pending_capture_ts = NonZeroU64::new(poll_end);
192+
}
178193
}
179194
}
180195
result
@@ -207,16 +222,42 @@ impl FrameBuf {
207222
}
208223

209224
/// Emit one `TaskDumpEvent` per recorded callchain, then clear.
225+
/// Trimming via `_Unwind_FindEnclosingFunction` happens here (emit path)
226+
/// rather than during capture, keeping the hot path lock-free.
210227
fn emit(&mut self, shared: &SharedState, task_id: TaskId, capture_ts: u64) {
211228
shared.if_enabled(|buf| {
212-
for i in 0..self.offsets.len() {
213-
let start = self.offsets[i];
214-
let end = self.offsets.get(i + 1).copied().unwrap_or(self.ips.len());
215-
buf.record_encodable_event(&TaskDumpData {
216-
timestamp_ns: capture_ts,
217-
task_id,
218-
callchain: &self.ips[start..end],
219-
});
229+
// Each chain is stored as 3 entries in offsets:
230+
// [ip_start_index, root_addr_as_usize, leaf_addr_as_usize]
231+
for chunk in self.offsets.chunks_exact(3) {
232+
let ip_start = chunk[0];
233+
let root_addr = chunk[1];
234+
let leaf_addr = chunk[2];
235+
236+
// Determine the end of this chain's IPs: either the start of
237+
// the next chain or the end of the ips buffer.
238+
let ip_end = self
239+
.offsets
240+
.chunks_exact(3)
241+
.map(|c| c[0])
242+
.find(|&s| s > ip_start)
243+
.unwrap_or(self.ips.len());
244+
245+
let raw = &self.ips[ip_start..ip_end];
246+
let root = if root_addr == 0 {
247+
None
248+
} else {
249+
Some(root_addr as *const core::ffi::c_void)
250+
};
251+
let leaf = leaf_addr as *const core::ffi::c_void;
252+
let chain = crate::unwind::trim_frames(raw, root, leaf);
253+
254+
if !chain.is_empty() {
255+
buf.record_encodable_event(&TaskDumpData {
256+
timestamp_ns: capture_ts,
257+
task_id,
258+
callchain: chain,
259+
});
260+
}
220261
}
221262
});
222263
self.clear();
@@ -241,23 +282,18 @@ impl FrameBuf {
241282
},
242283
|meta| {
243284
offsets.push(ips.len());
244-
capture_frames(ips, meta.root_addr, meta.trace_leaf_addr);
285+
// Hot path: collect raw IPs only — no _Unwind_FindEnclosingFunction,
286+
// no dl_iterate_phdr, no global locks. Trimming to root/leaf
287+
// boundaries happens later in emit().
288+
crate::unwind::collect_frames_raw(ips);
289+
// Stash the root/leaf addresses so we can trim at emit time.
290+
offsets.push(meta.root_addr.map_or(0, |p| p as usize));
291+
offsets.push(meta.trace_leaf_addr as usize);
245292
},
246293
);
247294
}
248295
}
249296

250-
/// Walk the stack, collecting instruction pointers between `leaf_addr` and
251-
/// `root_addr`. Calls `_Unwind_Backtrace` directly via [`crate::unwind`],
252-
/// bypassing the `backtrace` crate's process-wide mutex.
253-
fn capture_frames(
254-
ips: &mut Vec<u64>,
255-
root_addr: Option<*const core::ffi::c_void>,
256-
leaf_addr: *const core::ffi::c_void,
257-
) {
258-
crate::unwind::collect_frames(ips, root_addr, leaf_addr);
259-
}
260-
261297
/// Borrowed-callchain view of a task-dump event that implements [`Encodable`]
262298
/// by interning its ips into the batch's stack pool.
263299
pub(crate) struct TaskDumpData<'a> {

dial9-tokio-telemetry/src/unwind.rs

Lines changed: 53 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,21 @@
22
//!
33
//! Bypasses the `backtrace` crate's global mutex by calling `_Unwind_Backtrace`
44
//! directly. Adapted from backtrace-0.3.76/src/backtrace/libunwind.rs (Apache-2.0/MIT).
5+
//!
6+
//! # Two-phase design
7+
//!
8+
//! Capture and trimming are separated to keep the hot path lock-free:
9+
//!
10+
//! - [`collect_frames_raw`]: called on every yield. Collects all IPs using only
11+
//! `_Unwind_GetIP` — no global locks.
12+
//! - [`trim_frames`]: called only at emit time (rare, Poisson-sampled). Uses
13+
//! `_Unwind_FindEnclosingFunction` to locate the root/leaf boundaries and
14+
//! trims the collected IPs. This calls `dl_iterate_phdr` internally but only
15+
//! runs on the infrequent emit path.
516
617
use core::ffi::c_void;
718

8-
#[allow(non_camel_case_types)]
19+
#[allow(non_camel_case_types, dead_code)]
920
#[repr(C)]
1021
enum UnwindReasonCode {
1122
NoReason = 0,
@@ -23,51 +34,55 @@ unsafe extern "C" {
2334
fn _Unwind_FindEnclosingFunction(pc: *mut c_void) -> *mut c_void;
2435
}
2536

26-
struct CallbackData<'a> {
37+
// ─── Raw collection (hot path, no locks) ────────────────────────────────────
38+
39+
struct RawCallbackData<'a> {
2740
frame_ips: &'a mut Vec<u64>,
28-
above_leaf: bool,
29-
root_addr: Option<*const c_void>,
30-
leaf_addr: *const c_void,
3141
}
3242

33-
extern "C" fn trace_fn(ctx: *mut UnwindContext, arg: *mut c_void) -> UnwindReasonCode {
34-
let data = unsafe { &mut *arg.cast::<CallbackData<'_>>() };
35-
let ip = unsafe { _Unwind_GetIP(ctx) } as *mut c_void;
36-
let symbol_address = unsafe { _Unwind_FindEnclosingFunction(ip) };
37-
38-
let below_root = data
39-
.root_addr
40-
.is_none_or(|root| !std::ptr::eq(symbol_address, root));
41-
42-
if data.above_leaf && below_root {
43-
data.frame_ips.push(ip as u64);
44-
}
45-
46-
if std::ptr::eq(symbol_address, data.leaf_addr) {
47-
data.above_leaf = true;
48-
}
43+
extern "C" fn raw_trace_fn(ctx: *mut UnwindContext, arg: *mut c_void) -> UnwindReasonCode {
44+
let data = unsafe { &mut *arg.cast::<RawCallbackData<'_>>() };
45+
let ip = unsafe { _Unwind_GetIP(ctx) } as u64;
46+
data.frame_ips.push(ip);
47+
UnwindReasonCode::NoReason
48+
}
4949

50-
if below_root {
51-
UnwindReasonCode::NoReason
52-
} else {
53-
UnwindReasonCode::Failure
50+
/// Collect all instruction pointers on the current call stack.
51+
/// Uses only `_Unwind_GetIP` — no `dl_iterate_phdr`, no global locks.
52+
pub(crate) fn collect_frames_raw(frame_ips: &mut Vec<u64>) {
53+
let mut data = RawCallbackData { frame_ips };
54+
unsafe {
55+
_Unwind_Backtrace(raw_trace_fn, (&raw mut data).cast());
5456
}
5557
}
5658

57-
/// Walk the call stack, collecting instruction pointers between `leaf_addr` and
58-
/// `root_addr`. This calls `_Unwind_Backtrace` directly without any global lock.
59-
pub(crate) fn collect_frames(
60-
frame_ips: &mut Vec<u64>,
59+
// ─── Trimming (emit path, uses FindEnclosingFunction) ───────────────────────
60+
61+
/// Trim a raw IP list to only the frames between `leaf_addr` and `root_addr`.
62+
/// Calls `_Unwind_FindEnclosingFunction` which takes `dl_iterate_phdr`'s lock,
63+
/// but this only runs on the infrequent emit path.
64+
pub(crate) fn trim_frames(
65+
frame_ips: &[u64],
6166
root_addr: Option<*const c_void>,
6267
leaf_addr: *const c_void,
63-
) {
64-
let mut data = CallbackData {
65-
frame_ips,
66-
above_leaf: false,
67-
root_addr,
68-
leaf_addr,
69-
};
70-
unsafe {
71-
_Unwind_Backtrace(trace_fn, (&raw mut data).cast());
68+
) -> &[u64] {
69+
// Find the first frame inside the leaf function (start of interesting range).
70+
let leaf_start = frame_ips.iter().position(|&ip| {
71+
let sym = unsafe { _Unwind_FindEnclosingFunction(ip as *mut c_void) };
72+
std::ptr::eq(sym, leaf_addr)
73+
});
74+
75+
// Find the first frame inside the root function (end of interesting range).
76+
let root_end = root_addr.and_then(|root| {
77+
frame_ips.iter().position(|&ip| {
78+
let sym = unsafe { _Unwind_FindEnclosingFunction(ip as *mut c_void) };
79+
std::ptr::eq(sym, root)
80+
})
81+
});
82+
83+
match (leaf_start, root_end) {
84+
(Some(start), Some(end)) if start + 1 < end => &frame_ips[start + 1..end],
85+
(Some(start), None) if start + 1 < frame_ips.len() => &frame_ips[start + 1..],
86+
_ => frame_ips,
7287
}
7388
}

dial9-viewer/ui/demo-trace.bin

-1.31 MB
Binary file not shown.

examples/metrics-service/src/main.rs

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ struct Args {
4848

4949
#[arg(
5050
long,
51-
default_value = "100000000",
51+
default_value = "100000000", // 100 MB
5252
help = "Max trace file size in bytes"
5353
)]
5454
trace_max_file_size: u64,
@@ -77,6 +77,9 @@ struct Args {
7777

7878
#[arg(long, help = "AWS region for S3 uploads (defaults to SDK default)")]
7979
s3_region: Option<String>,
80+
81+
#[arg(long, help = "Disable task dump capture")]
82+
no_task_dumps: bool,
8083
}
8184

8285
#[derive(Clone)]
@@ -147,8 +150,8 @@ fn main() -> std::io::Result<()> {
147150
if args.demo {
148151
args.run_duration = 4;
149152
args.worker_threads = 2;
150-
args.trace_max_file_size = 20_000_000;
151-
args.trace_max_total_size = 20_000_000;
153+
args.trace_max_file_size = 100_000_000;
154+
args.trace_max_total_size = 100_000_000;
152155
}
153156

154157
let writer = RotatingWriter::builder()
@@ -169,12 +172,16 @@ fn main() -> std::io::Result<()> {
169172
builder.worker_threads(args.worker_threads).enable_all();
170173
let traced_builder = TracedRuntime::builder()
171174
.with_trace_path(&args.trace_path)
172-
.with_task_tracking(true)
173-
.with_task_dumps(
175+
.with_task_tracking(true);
176+
let traced_builder = if args.no_task_dumps {
177+
traced_builder
178+
} else {
179+
traced_builder.with_task_dumps(
174180
TaskDumpConfig::builder()
175181
.idle_threshold(Duration::from_millis(5))
176182
.build(),
177-
);
183+
)
184+
};
178185
#[cfg(target_os = "linux")]
179186
let traced_builder = traced_builder
180187
.with_cpu_profiling(CpuProfilingConfig::default())

0 commit comments

Comments
 (0)