Update readme and allow tests to pass on macOS

rcoh · rcoh · commit 30e64cd9c8b1 · 2026-03-01T06:48:30.000-05:00
diff --git a/dial9-tokio-telemetry/Cargo.toml b/dial9-tokio-telemetry/Cargo.toml
@@ -25,6 +25,8 @@ criterion = "0.5"
 hdrhistogram = "7"
 proptest = "1"
 tempfile = "3"
+
+[target.'cfg(target_os = "linux")'.dev-dependencies]
 dial9-tokio-telemetry = { path = ".", features = ["cpu-profiling"] }
 
 [[bench]]
@@ -51,3 +53,11 @@ required-features = ["task-dump"]
 [[example]]
 name = "debug_timing"
 required-features = ["task-dump"]
+
+[[example]]
+name = "blocking_sleep"
+required-features = ["cpu-profiling"]
+
+[[example]]
+name = "cpu_profile_workload"
+required-features = ["cpu-profiling"]
diff --git a/dial9-tokio-telemetry/README.md b/dial9-tokio-telemetry/README.md
@@ -1,10 +1,6 @@
-# Tokio Telemtry
+# dial9-tokio-telemetry
 
-The `telemetry` module records lightweight runtime telemetry — poll start/end, worker park/unpark, and queue depth samples — into a compact binary trace format. Traces can be analyzed offline to find idle workers, long polls, and scheduling imbalances.
-
-### Quick Start
-
-The easiest way to get started is with `TracedRuntime`, which wires up all the hooks and background threads for you. Use a `RotatingWriter` to bound disk usage in production (see [`examples/telemetry_rotating.rs`](examples/telemetry_rotating.rs) for the full example):
+**Low-overhead runtime telemetry for Tokio.** Records poll timing, worker park/unpark, wake events, queue depths, and (on Linux) CPU profile samples into a compact binary trace format. Traces can be analyzed offline to find long polls, scheduling delays, idle workers, and CPU hotspots.
 
 ```rust
 use dial9_tokio_telemetry::telemetry::{RotatingWriter, TracedRuntime};
@@ -19,106 +15,190 @@ fn main() -> std::io::Result<()> {
     let mut builder = tokio::runtime::Builder::new_multi_thread();
     builder.worker_threads(4).enable_all();
 
-    // build_and_start() enables telemetry immediately
-    // use build() to start disabled, then call guard.enable() later
     let (runtime, _guard) = TracedRuntime::build_and_start(builder, Box::new(writer))?;
 
     runtime.block_on(async {
-        // ... your async code here ...
+        // your async code here
     });
 
-    // Dropping `runtime` then `_guard` performs a final flush.
     Ok(())
 }
 ```
 
-`TracedRuntime::build` returns a `TelemetryGuard` whose `handle()` method gives you a cheap, cloneable `TelemetryHandle` you can use to enable/disable recording at runtime.
+Events are 6–16 bytes on the wire, and a typical request generates ~20–35 bytes of trace data (a few poll events plus park/unpark). At 10k requests/sec that's well under 1 MB/s — `RotatingWriter` caps total disk usage so you can leave it running indefinitely. Typical CPU overhead is under 5%.
 
-### Writers
+> **Note:** dial9-tokio-telemetry is designed for always-on production use, but it's still early software. Measure overhead and validate behavior in your environment before deploying to production.
 
-| Writer | Use case |
-|--------|----------|
-| `RotatingWriter` | Production — automatically rotates and evicts old files to stay within a total size budget |
-| `SimpleBinaryWriter` | Quick experiments — writes a single trace file with no size management |
-| `NullWriter` | Benchmarking — measures hook overhead without any I/O |
+## Is there a demo?
+Yes, check out this [quick walkthrough (YouTube)](https://www.youtube.com/watch?v=zJOzU_6Mf7Q)!
 
-**Future**: S3 writer for direct cloud storage, or use existing log shipping (CWAgent, Firelens, etc.) to push trace files.
+## Why dial9-tokio-telemetry?
 
-### Analyzing Traces
+Understanding how Tokio is actually running your application — which tasks are slow, why workers are idle, where scheduling delays come from — is hard to do from the outside. This crate records a continuous, low-overhead trace of runtime behavior.
 
-Use the included examples to inspect trace files:
+Compared to [tokio-console](https://github.com/tokio-rs/console), which is designed for live debugging, dial9-tokio-telemetry is designed for post-hoc analysis. Because traces are written to files with bounded disk usage, you can leave it running in production and come back later to deeply analyze what went wrong or why a specific request was slow. On Linux, traces include CPU profile samples and scheduler events, so you can see not just *that* a task was delayed but *what code* was running on the worker instead.
 
-```bash
-# Print a summary with per-worker stats and idle-worker detection
-cargo run --example analyze_trace -- /tmp/my_traces/trace.0.bin
+## What gets recorded automatically
 
-# Convert a binary trace to JSONL for ad-hoc analysis
-cargo run --example trace_to_jsonl -- /tmp/my_traces/trace.0.bin output.jsonl
+`TracedRuntime` installs hooks on the Tokio runtime builder. These fire for every task on the runtime with no code changes required:
+
+| Event | Fields |
+|-------|--------|
+| `PollStart` / `PollEnd` | timestamp, worker, task ID, spawn location, local queue depth |
+| `WorkerPark` / `WorkerUnpark` | timestamp, worker, local queue depth, thread CPU time, schedstat wait |
+| `QueueSample` | timestamp, global queue depth (sampled every 10 ms) |
+| `TaskSpawn` / `SpawnLocationDef` | task→spawn-location mapping (when `task_tracking` is enabled) |
+
+## Wake event tracking
+
+Wake events — which task woke which other task — are *not* captured automatically. Tokio's runtime hooks don't expose waker identity, so capturing this requires wrapping the future in `Traced<F>`, which installs a custom waker that records a `WakeEvent` before forwarding to the real waker.
+
+Use `handle.spawn()` instead of `tokio::spawn()`:
+
+```rust,no_run
+# use dial9_tokio_telemetry::telemetry::{RotatingWriter, TracedRuntime};
+# fn main() -> std::io::Result<()> {
+# let writer = RotatingWriter::new("/tmp/t.bin", 1024, 4096)?;
+# let builder = tokio::runtime::Builder::new_multi_thread();
+let (runtime, guard) = TracedRuntime::build_and_start(builder, Box::new(writer))?;
+let handle = guard.handle();
 
-# Open the interactive HTML viewer
-open trace_viewer.html
-# Then drag-and-drop a .bin file to visualize the timeline
+runtime.block_on(async {
+    // wake events captured — uses Traced<F> wrapper
+    handle.spawn(async { /* ... */ });
+
+    // wake events NOT captured — still gets poll/park/queue telemetry
+    tokio::spawn(async { /* ... */ });
+});
+# Ok(())
+# }
 ```
 
-**Future**: S3 writer for direct cloud storage, or use existing log shipping (CWAgent, Firelens, etc.) to push trace files.
+For frameworks like Axum where you don't control the spawn call, you need to wrap the accept loop. See [`examples/metrics-service/src/axum_traced.rs`](../examples/metrics-service/src/axum_traced.rs) for a working example that wraps both the accept loop and per-connection futures.
 
-## Examples
+## Platform support
+
+Core telemetry (poll timing, park/unpark, queue depth, wake events) works on all platforms.
+
+On Linux, you get additional data for free:
+- **Thread CPU time** in park/unpark events via `CLOCK_THREAD_CPUTIME_ID` (vDSO, ~20–40 ns)
+- **Scheduler wait time** via `/proc/self/task/<tid>/schedstat` — shows how long the OS kept your thread off-CPU
+
+On non-Linux platforms these fields are zero.
+
+### CPU profiling (Linux only)
+
+With the `cpu-profiling` feature, you can enable `perf_event_open`-based CPU sampling and scheduler event capture. This records stack traces attributed to specific worker threads, so you can see *what code* was running during a scheduling delay.
+
+```rust,no_run
+# #[cfg(feature = "cpu-profiling")]
+# fn main() -> std::io::Result<()> {
+# use dial9_tokio_telemetry::telemetry::{RotatingWriter, TracedRuntime};
+use dial9_tokio_telemetry::telemetry::{CpuProfilingConfig, SchedEventConfig};
+
+# let writer = RotatingWriter::new("/tmp/t.bin", 1024, 4096)?;
+# let builder = tokio::runtime::Builder::new_multi_thread();
+let (runtime, guard) = TracedRuntime::builder()
+    .with_task_tracking(true)
+    .with_cpu_profiling(CpuProfilingConfig::default())
+    .with_sched_events(SchedEventConfig { include_kernel: true })
+    .with_inline_callframe_symbols(true)
+    .build(builder, Box::new(writer))?;
+# Ok(())
+# }
+# #[cfg(not(feature = "cpu-profiling"))]
+# fn main() {}
+```
+
+This pulls in [`dial9-perf-self-profile`](../perf-self-profile) for `perf_event_open` access. It records `CpuSample` events with full callchains and `CallframeDef` / `ThreadNameDef` metadata for offline symbolization.
+
+## Getting started
+
+`TracedRuntime::build` returns a `(Runtime, TelemetryGuard)` tuple. The guard owns the flush thread and provides a `TelemetryHandle` for enabling/disabling recording at runtime:
+
+```rust,no_run
+# use dial9_tokio_telemetry::telemetry::{RotatingWriter, TracedRuntime};
+# fn main() -> std::io::Result<()> {
+# let writer = RotatingWriter::new("/tmp/t.bin", 1024, 4096)?;
+# let builder = tokio::runtime::Builder::new_multi_thread();
+let (runtime, guard) = TracedRuntime::builder()
+    .with_task_tracking(true)
+    .build(builder, Box::new(writer))?;
+
+// build() starts with recording disabled; enable it when you're ready
+guard.enable();
+
+// TelemetryHandle is Clone + Send — pass it around
+let handle = guard.handle();
+handle.disable();
+# Ok(())
+# }
+```
+
+### Writers
+
+`RotatingWriter` is what you want for production — it rotates files and evicts old ones to stay within a total size budget. `SimpleBinaryWriter` writes a single file with no size management, useful for quick experiments. `NullWriter` measures hook overhead without doing any I/O.
+
+### Analyzing traces
 
-Run examples with:
 ```bash
-# Long-poll detection
-cargo run --example long_sleep
-cargo run --example cancelled_task
-cargo run --example completing_task
-
-# Telemetry
-cargo run --example telemetry_demo
-cargo run --example telemetry_rotating
+# per-worker stats, wake→poll delays, idle worker detection
+cargo run --example analyze_trace -- /tmp/my_traces/trace.0.bin
+
+# convert to JSONL for ad-hoc scripting
+cargo run --example trace_to_jsonl -- /tmp/my_traces/trace.0.bin output.jsonl
 ```
 
-## Benchmarks
+There's also an interactive HTML trace viewer — open `trace_viewer/index.html` and drag in a `.bin` file. [Here's a demo.](https://www.youtube.com/watch?v=zJOzU_6Mf7Q)
+
+See [TRACE_ANALYSIS_GUIDE.md](TRACE_ANALYSIS_GUIDE.md) for a walkthrough of diagnosing scheduling delays and CPU hotspots from trace data.
+
+## Features
+
+- **`cpu-profiling`** — Linux only. Enables `perf_event_open`-based CPU sampling and scheduler event capture via `dial9-perf-self-profile`.
+- **`task-dump`** — Enables Tokio's `taskdump` feature for async stack traces. Required for the `long_sleep`, `completing_task`, `cancelled_task`, and `debug_timing` examples.
+
+## Examples
 
-Run benchmarks with:
 ```bash
-cargo bench
+cargo run --example telemetry_rotating   # rotating writer demo
+cargo run --example simple_workload      # basic instrumented workload
+cargo run --example realistic_workload   # mixed CPU/IO workload
+cargo run --example long_workload        # longer run for trace analysis
 ```
 
-### Overhead Comparison
+The [`examples/metrics-service`](../examples/metrics-service) directory has a full Axum service with DynamoDB persistence, a load-generating client, and telemetry wired up end-to-end.
+
+## Overhead
 
-Compare baseline vs telemetry overhead:
 ```bash
 ./scripts/compare_overhead.sh [duration_secs]
 ```
 
-This runs the `overhead_bench` in both modes and validates:
-- Telemetry overhead is acceptable (< 10%)
-- Trace bytes per request (20-35 bytes) - tracks total trace data generated per client request
-- Bytes per trace event (6-12 bytes) - validates binary format efficiency
+This runs the `overhead_bench` binary with and without telemetry and reports the difference. Typical output:
 
-Example output:
-```
-=== Comparison ===
+```text
 Baseline:   286794 req/s, p50=174.1µs, p99=280.6µs
 Telemetry:  277626 req/s, p50=180.2µs, p99=289.3µs
 Overhead:   3.2%
-
-=== Trace Efficiency ===
-Trace bytes/request:    25.56
-Bytes/trace event:      6.39
-Client requests/sec:    277682
 ```
 
-## Configuration
+## Workspace
+
+This repo is a Cargo workspace with three members:
 
-The system uses `.cargo/config.toml` to enable the `tokio_unstable` flag required for task dumps.
+- [`dial9-tokio-telemetry`](.) — the main crate
+- [`dial9-perf-self-profile`](../perf-self-profile) — minimal Linux `perf_event_open` wrapper for CPU profiling and scheduler events
+- [`examples/metrics-service`](../examples/metrics-service) — end-to-end example service
 
-## Dependencies
+## Future work
 
-- `tokio` (with `taskdump` feature)
-- `arc-swap` (for lock-free sentinel updates)
-- `pin-project-lite` (for proper pinning in the future wrapper)
-- `smallvec` (for efficient small vector storage)
+- **S3 writer** — upload traces directly to S3 instead of relying on log shipping
+- **Parquet output** — write traces as Parquet for efficient querying with Athena, DuckDB, etc.
+- **Tokio task dumps** — capture async stack traces of all in-flight tasks
+- **Retroactive sampling** — trace data lives in a ring buffer; when your application detects anomalous behavior, it triggers persistence of the last N seconds of data rather than recording everything continuously
+- **Out-of-process symbolication** — resolve CPU profile stack traces in a background process to avoid adding latency or memory overhead to the application
 
-## Useful links
+## License
 
-- [Code Browser](https://code.amazon.com/packages/RustProfilingExperiments/)
+This project is licensed under the Apache-2.0 License.
diff --git a/dial9-tokio-telemetry/benches/overhead_bench.rs b/dial9-tokio-telemetry/benches/overhead_bench.rs
@@ -14,9 +14,11 @@
 //! Duration defaults to 30 seconds. A 3-second warmup precedes measurement.
 
 use dial9_tokio_telemetry::telemetry::{
-    CpuProfilingConfig, NullWriter, SimpleBinaryWriter, TelemetryGuard, TelemetryHandle,
+    NullWriter, SimpleBinaryWriter, TelemetryGuard, TelemetryHandle,
     TracedRuntime,
 };
+#[cfg(target_os = "linux")]
+use dial9_tokio_telemetry::telemetry::CpuProfilingConfig;
 use hdrhistogram::Histogram;
 use std::sync::Arc;
 use std::sync::atomic::{AtomicBool, Ordering};
@@ -173,10 +175,15 @@ fn main() {
         "telemetry" => {
             let writer =
                 Box::new(SimpleBinaryWriter::new("/tmp/overhead_bench_trace.bin").unwrap());
-            let (rt, g) = TracedRuntime::builder()
-                .with_task_tracking(true)
-                .with_cpu_profiling(CpuProfilingConfig::default())
-                .with_inline_callframe_symbols(true)
+            let mut tb = TracedRuntime::builder()
+                .with_task_tracking(true);
+            #[cfg(target_os = "linux")]
+            {
+                tb = tb
+                    .with_cpu_profiling(CpuProfilingConfig::default())
+                    .with_inline_callframe_symbols(true);
+            }
+            let (rt, g) = tb
                 .build_and_start(builder, writer)
                 .unwrap();
             (rt, Some(g))
diff --git a/dial9-tokio-telemetry/src/lib.rs b/dial9-tokio-telemetry/src/lib.rs
@@ -1,3 +1,5 @@
+#![doc = include_str!("../README.md")]
+
 pub mod telemetry;
 pub mod traced;
 
diff --git a/dial9-tokio-telemetry/tests/js_parser.rs b/dial9-tokio-telemetry/tests/js_parser.rs
@@ -1,6 +1,6 @@
 //! Integration test: verify JS trace parser matches Rust parser
 
-use dial9_tokio_telemetry::telemetry::{CpuProfilingConfig, SimpleBinaryWriter, TracedRuntime};
+use dial9_tokio_telemetry::telemetry::{SimpleBinaryWriter, TracedRuntime};
 use std::process::Command;
 use tempfile::TempDir;
 
@@ -27,18 +27,21 @@ fn test_js_parser_matches_rust() {
     let trace_path = temp_dir.path().join("test_trace.bin");
     let jsonl_path = temp_dir.path().join("expected.jsonl");
 
-    // Generate a trace with CPU profiling
+    // Generate a trace — enable CPU profiling on Linux where it's available
     {
         let mut builder = tokio::runtime::Builder::new_multi_thread();
         builder.worker_threads(2).enable_all();
 
         let writer = Box::new(SimpleBinaryWriter::new(&trace_path).unwrap());
-        let (runtime, _guard) = TracedRuntime::builder()
-            .with_task_tracking(true)
-            .with_cpu_profiling(CpuProfilingConfig::default())
-            .with_inline_callframe_symbols(true)
-            .build_and_start(builder, writer)
-            .unwrap();
+        #[allow(unused_mut)]
+        let mut tb = TracedRuntime::builder().with_task_tracking(true);
+        #[cfg(feature = "cpu-profiling")]
+        {
+            tb = tb
+                .with_cpu_profiling(dial9_tokio_telemetry::telemetry::CpuProfilingConfig::default())
+                .with_inline_callframe_symbols(true);
+        }
+        let (runtime, _guard) = tb.build_and_start(builder, writer).unwrap();
 
         runtime.block_on(async {
             let mut tasks = vec![];
diff --git a/examples/metrics-service/README.md b/examples/metrics-service/README.md

Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,5 @@`
	`1`	`+#![doc = include_str!("../README.md")]`
	`2`	`+`
`1`	`3`	`pub mod telemetry;`
`2`	`4`	`pub mod traced;`
`3`	`5`