@@ -10,63 +10,51 @@ use std::collections::{HashMap, HashSet};
1010use std:: panic:: Location ;
1111use std:: sync:: atomic:: { AtomicBool , Ordering } ;
1212use std:: sync:: { Arc , Mutex } ;
13- use std:: thread:: ThreadId ;
1413use std:: time:: { Duration , Instant } ;
1514use tokio:: runtime:: { Handle , RuntimeMetrics } ;
1615
/// Worker id written into events when the emitting thread could not be matched
/// to a tokio worker, i.e. when `resolve_worker_id` returns `None`.
///
/// NOTE(review): 255 is itself a valid worker index on a runtime with more than
/// 255 workers. Confirm that whatever consumes these events treats this value as
/// reserved, or that worker counts are bounded below it.
const UNKNOWN_WORKER: usize = 255;
18+
thread_local! {
    /// Per-thread cache of this thread's tokio worker index.
    ///
    /// `None` means "not resolved yet"; `resolve_worker_id` fills it in the first
    /// time a lookup succeeds. The cache relies on a worker index being stable for
    /// the lifetime of a thread: a thread never turns into a *different* worker,
    /// although it may stop being a worker altogether.
    ///
    /// NOTE(review): nothing visible here ever resets this cell, so a thread that
    /// stops being a worker keeps reporting its last cached index.
    static WORKER_ID: Cell<Option<usize>> = const { Cell::new(None) };

    /// schedstat `wait_time_ns` sampled when the thread parked; read back on
    /// unpark to compute how long the thread waited. Starts at 0.
    static PARKED_SCHED_WAIT: Cell<u64> = const { Cell::new(0) };
}
2327
24- /// Build a ThreadId → tokio worker index map from RuntimeMetrics.
25- fn build_worker_map ( metrics : & RuntimeMetrics ) -> HashMap < ThreadId , usize > {
26- let mut map = HashMap :: new ( ) ;
27- for i in 0 ..metrics. num_workers ( ) {
28- if let Some ( tid) = metrics. worker_thread_id ( i) {
29- map. insert ( tid, i) ;
30- }
31- }
32- map
33- }
34-
3528/// Resolve the current thread's tokio worker index, caching in TLS.
36- /// Falls back to 0 if the map isn't populated yet.
37- fn resolve_worker_id ( worker_map : & ArcSwap < HashMap < ThreadId , usize > > ) -> usize {
38- // TODO: should return Option<usize> instead
29+ /// Returns None if the thread is not a tokio worker.
30+ ///
31+ /// The result is cached permanently in TLS because a thread's worker identity
32+ /// is stable: it won't become a different worker, it can only stop being one.
33+ fn resolve_worker_id ( metrics : & ArcSwap < Option < RuntimeMetrics > > ) -> Option < usize > {
3934 WORKER_ID . with ( |cell| {
4035 if let Some ( id) = cell. get ( ) {
41- return id ;
36+ return Some ( id ) ;
4237 }
4338 let tid = std:: thread:: current ( ) . id ( ) ;
44- let map = worker_map. load ( ) ;
45- let id = map. get ( & tid) . copied ( ) . unwrap_or ( 0 ) ;
46- if id != 0 || map. contains_key ( & tid) {
47- cell. set ( Some ( id) ) ;
39+ if let Some ( ref m) = * * metrics. load ( ) {
40+ for i in 0 ..m. num_workers ( ) {
41+ if m. worker_thread_id ( i) == Some ( tid) {
42+ cell. set ( Some ( i) ) ;
43+ return Some ( i) ;
44+ }
45+ }
4846 }
49- id
47+ None
5048 } )
5149}
5250
53- /// Invalidate the cached worker ID so it's re-resolved on next event.
54- fn invalidate_worker_id ( ) {
55- WORKER_ID . with ( |cell| cell. set ( None ) ) ;
56- }
57-
5851/// Shared state accessed lock-free by callbacks on the hot path.
5952/// No spawn location tracking here — all interning happens in the flush thread.
6053struct SharedState {
6154 enabled : AtomicBool ,
6255 collector : CentralCollector ,
6356 start_time : Instant ,
6457 metrics : ArcSwap < Option < RuntimeMetrics > > ,
65- /// ThreadId → tokio worker index, rebuilt every flush cycle.
66- /// Uses ArcSwap for lock-free reads on hot path (cached in TLS).
67- /// Must rebuild periodically because worker threads can restart with new ThreadIds.
68- /// Clone cost is negligible: ~100ns for typical instances, max ~1µs on very large instances (100s of workers), every 250ms.
69- worker_map : ArcSwap < HashMap < ThreadId , usize > > ,
7058}
7159
7260impl SharedState {
@@ -76,7 +64,6 @@ impl SharedState {
7664 collector : CentralCollector :: new ( ) ,
7765 start_time : Instant :: now ( ) ,
7866 metrics : ArcSwap :: from_pointee ( None ) ,
79- worker_map : ArcSwap :: from_pointee ( HashMap :: new ( ) ) ,
8067 }
8168 }
8269
@@ -91,64 +78,66 @@ impl SharedState {
9178 let should_flush = buf. should_flush ( ) || matches ! ( event, RawEvent :: WorkerPark { .. } ) ;
9279 if should_flush {
9380 self . collector . accept_flush ( buf. flush ( ) ) ;
94- invalidate_worker_id ( ) ;
9581 }
9682 } ) ;
9783 }
9884
9985 fn make_poll_start ( & self , location : & ' static Location < ' static > , task_id : TaskId ) -> RawEvent {
100- let worker_id = resolve_worker_id ( & self . worker_map ) ;
86+ let worker_id = resolve_worker_id ( & self . metrics ) ;
10187 let metrics_guard = self . metrics . load ( ) ;
102- let worker_local_queue_depth = if let Some ( ref metrics) = * * metrics_guard {
103- metrics. worker_local_queue_depth ( worker_id)
104- } else {
105- 0
106- } ;
88+ let worker_local_queue_depth =
89+ if let ( Some ( worker_id) , Some ( metrics) ) = ( worker_id, & * * metrics_guard) {
90+ metrics. worker_local_queue_depth ( worker_id)
91+ } else {
92+ 0
93+ } ;
10794 RawEvent :: PollStart {
10895 timestamp_nanos : self . start_time . elapsed ( ) . as_nanos ( ) as u64 ,
109- worker_id,
96+ worker_id : worker_id . unwrap_or ( UNKNOWN_WORKER ) ,
11097 worker_local_queue_depth,
11198 task_id,
11299 location,
113100 }
114101 }
115102
116103 fn make_poll_end ( & self ) -> RawEvent {
117- let worker_id = resolve_worker_id ( & self . worker_map ) ;
104+ let worker_id = resolve_worker_id ( & self . metrics ) ;
118105 RawEvent :: PollEnd {
119106 timestamp_nanos : self . start_time . elapsed ( ) . as_nanos ( ) as u64 ,
120- worker_id,
107+ worker_id : worker_id . unwrap_or ( UNKNOWN_WORKER ) ,
121108 }
122109 }
123110
124111 fn make_worker_park ( & self ) -> RawEvent {
125- let worker_id = resolve_worker_id ( & self . worker_map ) ;
112+ let worker_id = resolve_worker_id ( & self . metrics ) ;
126113 let metrics_guard = self . metrics . load ( ) ;
127- let worker_local_queue_depth = if let Some ( ref metrics) = * * metrics_guard {
128- metrics. worker_local_queue_depth ( worker_id)
129- } else {
130- 0
131- } ;
114+ let worker_local_queue_depth =
115+ if let ( Some ( worker_id) , Some ( metrics) ) = ( worker_id, & * * metrics_guard) {
116+ metrics. worker_local_queue_depth ( worker_id)
117+ } else {
118+ 0
119+ } ;
132120 let cpu_time_nanos = crate :: telemetry:: events:: thread_cpu_time_nanos ( ) ;
133121 if let Ok ( ss) = SchedStat :: read_current ( ) {
134122 PARKED_SCHED_WAIT . with ( |c| c. set ( ss. wait_time_ns ) ) ;
135123 }
136124 RawEvent :: WorkerPark {
137125 timestamp_nanos : self . start_time . elapsed ( ) . as_nanos ( ) as u64 ,
138- worker_id,
126+ worker_id : worker_id . unwrap_or ( UNKNOWN_WORKER ) ,
139127 worker_local_queue_depth,
140128 cpu_time_nanos,
141129 }
142130 }
143131
144132 fn make_worker_unpark ( & self ) -> RawEvent {
145- let worker_id = resolve_worker_id ( & self . worker_map ) ;
133+ let worker_id = resolve_worker_id ( & self . metrics ) ;
146134 let metrics_guard = self . metrics . load ( ) ;
147- let worker_local_queue_depth = if let Some ( ref metrics) = * * metrics_guard {
148- metrics. worker_local_queue_depth ( worker_id)
149- } else {
150- 0
151- } ;
135+ let worker_local_queue_depth =
136+ if let ( Some ( worker_id) , Some ( metrics) ) = ( worker_id, & * * metrics_guard) {
137+ metrics. worker_local_queue_depth ( worker_id)
138+ } else {
139+ 0
140+ } ;
152141 let cpu_time_nanos = crate :: telemetry:: events:: thread_cpu_time_nanos ( ) ;
153142 let sched_wait_delta_nanos = if let Ok ( ss) = SchedStat :: read_current ( ) {
154143 let prev = PARKED_SCHED_WAIT . with ( |c| c. get ( ) ) ;
@@ -158,7 +147,7 @@ impl SharedState {
158147 } ;
159148 RawEvent :: WorkerUnpark {
160149 timestamp_nanos : self . start_time . elapsed ( ) . as_nanos ( ) as u64 ,
161- worker_id,
150+ worker_id : worker_id . unwrap_or ( UNKNOWN_WORKER ) ,
162151 worker_local_queue_depth,
163152 cpu_time_nanos,
164153 sched_wait_delta_nanos,
@@ -320,13 +309,6 @@ impl TelemetryRecorder {
320309 }
321310
322311 fn flush ( & mut self ) {
323- let metrics_guard = self . shared . metrics . load ( ) ;
324- if let Some ( ref metrics) = * * metrics_guard {
325- self . shared
326- . worker_map
327- . store ( Arc :: new ( build_worker_map ( metrics) ) ) ;
328- }
329-
330312 for batch in self . shared . collector . drain ( ) {
331313 for raw in batch {
332314 self . write_raw_event ( raw) . unwrap ( ) ;
@@ -371,19 +353,12 @@ impl TelemetryRecorder {
371353 flush_state : FlushState :: new ( ) ,
372354 } ) ) ;
373355
374- let s0 = shared. clone ( ) ;
375356 let s1 = shared. clone ( ) ;
376357 let s2 = shared. clone ( ) ;
377358 let s3 = shared. clone ( ) ;
378359 let s4 = shared. clone ( ) ;
379360
380361 builder
381- . on_thread_start ( move || {
382- let metrics_guard = s0. metrics . load ( ) ;
383- if let Some ( ref metrics) = * * metrics_guard {
384- s0. worker_map . store ( Arc :: new ( build_worker_map ( metrics) ) ) ;
385- }
386- } )
387362 . on_thread_park ( move || {
388363 let event = s1. make_worker_park ( ) ;
389364 s1. record_event ( event) ;
0 commit comments