@@ -22,7 +22,7 @@ use futures::Future;
2222use futures:: stream:: { FuturesUnordered , StreamExt , unfold} ;
2323use nativelink_config:: stores:: Retry ;
2424use nativelink_error:: { Code , Error , make_err} ;
25- use tokio:: sync:: { mpsc, oneshot} ;
25+ use tokio:: sync:: { OwnedSemaphorePermit , Semaphore , mpsc, oneshot} ;
2626use tonic:: transport:: { Channel , Endpoint , channel} ;
2727use tracing:: { debug, error, info, warn} ;
2828
@@ -95,8 +95,13 @@ struct ConnectionManagerWorker {
9595 endpoints : Vec < ( ConnectionIndex , Endpoint ) > ,
9696 /// The channel used to communicate between a Connection and the worker.
9797 connection_tx : mpsc:: UnboundedSender < ConnectionRequest > ,
98- /// The number of connections that are currently allowed to be made.
99- available_connections : usize ,
98+ /// Gates the maximum number of in-flight `Connection` objects.
99+ /// Was an explicit `usize` counter; now an `Arc<Semaphore>` so the
100+ /// `OwnedSemaphorePermit` held by each `Connection` releases on
101+ /// drop (RAII), instead of relying on a `ConnectionRequest::Dropped`
102+ /// round-trip that could be lost on tonic transport errors or task
103+ /// aborts.
104+ available_connections : Arc < Semaphore > ,
100105 /// Channels that are currently being connected.
101106 connecting_channels : FuturesUnordered < Pin < Box < dyn Future < Output = IndexedChannel > + Send > > > ,
102107 /// Connected channels that are available for use.
@@ -136,14 +141,16 @@ impl ConnectionManager {
136141 . collect ( ) ;
137142
138143 if max_concurrent_requests == 0 {
139- max_concurrent_requests = usize:: MAX ;
144+ max_concurrent_requests = Semaphore :: MAX_PERMITS ;
145+ } else {
146+ max_concurrent_requests = max_concurrent_requests. min ( Semaphore :: MAX_PERMITS ) ;
140147 }
141148 if connections_per_endpoint == 0 {
142149 connections_per_endpoint = 1 ;
143150 }
144151 let worker = ConnectionManagerWorker {
145152 endpoints,
146- available_connections : max_concurrent_requests,
153+ available_connections : Arc :: new ( Semaphore :: new ( max_concurrent_requests) ) ,
147154 connection_tx,
148155 connecting_channels : FuturesUnordered :: new ( ) ,
149156 available_channels : VecDeque :: new ( ) ,
@@ -309,15 +316,15 @@ impl ConnectionManagerWorker {
309316
310317 // This must never be made async otherwise the select may cancel it.
311318 fn handle_worker ( & mut self , reason : String , tx : oneshot:: Sender < Connection > ) {
312- if let Some ( channel ) = ( self . available_connections > 0 )
313- . then_some ( ( ) )
314- . and_then ( | ( ) | self . available_channels . pop_front ( ) )
319+ let maybe_permit = self . available_connections . clone ( ) . try_acquire_owned ( ) . ok ( ) ;
320+ if let Some ( permit ) = maybe_permit
321+ && let Some ( channel ) = self . available_channels . pop_front ( )
315322 {
316323 debug ! ( reason, "ConnectionManager: request running" ) ;
317- self . provide_channel ( channel, tx) ;
324+ self . provide_channel ( channel, tx, permit ) ;
318325 } else {
319326 debug ! (
320- available_connections = self . available_connections,
327+ available_permits = self . available_connections. available_permits ( ) ,
321328 available_channels = self . available_channels. len( ) ,
322329 waiting_connections = self . waiting_connections. len( ) ,
323330 reason,
@@ -327,31 +334,36 @@ impl ConnectionManagerWorker {
327334 }
328335 }
329336
// Hands an established channel to a waiting requester by sending a freshly
// built `Connection` through the requester's oneshot `tx`.
//
// This hunk shows both versions of the function:
//   * removed (`-`): two-argument form that decremented the
//     `available_connections` counter before sending;
//   * added (`+`): three-argument form that takes an already-acquired
//     `OwnedSemaphorePermit` and stores it in the `Connection`'s `_permit`
//     field, so the capacity slot travels with the `Connection` itself.
//
// Parameters (added version):
//   channel - the established channel to wrap; its inner `channel` is also
//             cloned into `pending_channel`.
//   tx      - oneshot sender on which the requester awaits its `Connection`.
//   permit  - the capacity slot reserved by the caller for this `Connection`.
330- fn provide_channel ( & mut self , channel : EstablishedChannel , tx : oneshot:: Sender < Connection > ) {
331- // We decrement here because we create Connection, this will signal when
332- // it is Dropped and therefore increment this again.
333- self . available_connections -= 1 ;
337+ fn provide_channel (
338+ & mut self ,
339+ channel : EstablishedChannel ,
340+ tx : oneshot:: Sender < Connection > ,
341+ permit : OwnedSemaphorePermit ,
342+ ) {
// The result of `send` is deliberately discarded via `drop`.
// NOTE(review): presumably a failed send (receiver already gone) returns the
// `Connection`, which is then dropped here together with its permit — confirm
// against tokio's `oneshot::Sender::send` contract.
334343 drop ( tx. send ( Connection {
335344 tx : self . connection_tx . clone ( ) ,
336345 pending_channel : Some ( channel. channel . clone ( ) ) ,
337346 channel,
347+ _permit : permit,
338348 } ) ) ;
339349 }
340350
// Pairs queued requesters (`waiting_connections`) with idle channels
// (`available_channels`) for as long as both queues are non-empty and capacity
// remains, calling `provide_channel` for each pair.
//
// This hunk shows both versions of the loop:
//   * removed (`-`): capacity was part of the loop guard
//     (`available_connections > 0`);
//   * added (`+`): the guard checks only the two queues, and capacity is
//     claimed inside the loop with `try_acquire_owned` on a clone of the
//     `Arc<Semaphore>`; failing to get a permit ends the loop.
//
// NOTE(review): the handler arm that used to do `available_connections += 1`
// before calling this function (presumably `ConnectionRequest::Dropped`) now
// relies on the dropped `Connection`'s permit already being back in the
// semaphore. Rust drops struct fields only after `Drop::drop` returns, so if
// `Connection::drop` (not visible in this hunk) sends its notification from
// the `drop` body, this loop could run before `_permit` is released and break
// with a requester still queued — TODO confirm against the `Drop` impl.
341351 fn maybe_available_connection ( & mut self ) {
342- while self . available_connections > 0
343- && !self . waiting_connections . is_empty ( )
344- && !self . available_channels . is_empty ( )
345- {
346- if let Some ( channel) = self . available_channels . pop_front ( ) {
347- if let Some ( ( reason, tx) ) = self . waiting_connections . pop_front ( ) {
348- debug ! ( reason, "ConnectionManager: channel available, running" ) ;
349- self . provide_channel ( channel, tx) ;
350- } else {
351- // This should never happen, but better than an unwrap.
352- self . available_channels . push_front ( channel) ;
353- }
354- }
352+ while !self . waiting_connections . is_empty ( ) && !self . available_channels . is_empty ( ) {
// Claim a capacity slot first; no permit means we are at the limit, so stop.
// NOTE(review): `.ok()` followed by `let Some(..) else` could be written as
// `let Ok(permit) = ... else` without the intermediate `Option`.
353+ let Some ( permit) = self . available_connections . clone ( ) . try_acquire_owned ( ) . ok ( ) else {
354+ break ;
355+ } ;
// Defensive: the loop guard already saw a non-empty `available_channels`.
// The permit is handed back explicitly before leaving.
356+ let Some ( channel) = self . available_channels . pop_front ( ) else {
357+ drop ( permit) ;
358+ break ;
359+ } ;
// Defensive: the loop guard already saw a non-empty `waiting_connections`.
// The popped channel goes back to the front of the queue and the permit is
// handed back, so nothing is lost.
360+ let Some ( ( reason, tx) ) = self . waiting_connections . pop_front ( ) else {
361+ self . available_channels . push_front ( channel) ;
362+ drop ( permit) ;
363+ break ;
364+ } ;
365+ debug ! ( reason, "ConnectionManager: channel available, running" ) ;
366+ self . provide_channel ( channel, tx, permit) ;
355367 }
356368 }
357369
@@ -362,7 +374,6 @@ impl ConnectionManagerWorker {
362374 if let Some ( channel) = maybe_channel {
363375 self . available_channels . push_back ( channel) ;
364376 }
365- self . available_connections += 1 ;
366377 self . maybe_available_connection ( ) ;
367378 }
368379 ConnectionRequest :: Connected ( channel) => {
@@ -394,7 +405,8 @@ impl ConnectionManagerWorker {
394405/// re-connecting the underlying channel on error. It depends on users
395406/// reporting all errors.
396407/// NOTE: This should never be cloneable because its lifetime is linked to the
397- /// `ConnectionManagerWorker::available_connections`.
408+ /// semaphore permit it carries — `_permit` is released exactly once,
409+ /// when the `Connection` drops.
398410#[ derive( Debug ) ]
399411pub struct Connection {
400412 /// Communication with `ConnectionManagerWorker` to inform about transport
@@ -406,6 +418,7 @@ pub struct Connection {
406418 pending_channel : Option < Channel > ,
407419 /// The identifier to send to `tx`.
408420 channel : EstablishedChannel ,
421+ _permit : OwnedSemaphorePermit ,
409422}
410423
411424impl Drop for Connection {
0 commit comments