@@ -2116,6 +2116,182 @@ async fn worker_retries_on_internal_error_and_fails_test() -> Result<(), Error>
21162116 Ok ( ( ) )
21172117}
21182118
2119+ /// Worker crash-loop regression: an action whose worker keeps disconnecting
2120+ /// (e.g. `OOMKill`) used to bypass `max_job_retries` because
2121+ /// `UpdateWithDisconnect` requeued without counting as an attempt. The build
2122+ /// would only terminate when the Bazel client's `--test_timeout` fired,
2123+ /// hiding the cluster-side root cause behind a TIMEOUT/NO STATUS surface.
2124+ /// After the fix, disconnects count as attempts and exceed the cap.
2125+ #[ nativelink_test]
2126+ async fn worker_disconnect_loop_caps_at_max_job_retries_test ( ) -> Result < ( ) , Error > {
2127+ let worker_id = WorkerId ( "worker_id" . to_string ( ) ) ;
2128+
2129+ let task_change_notify = Arc :: new ( Notify :: new ( ) ) ;
2130+ let ( scheduler, _worker_scheduler) = SimpleScheduler :: new_with_callback (
2131+ & SimpleSpec {
2132+ max_job_retries : 1 ,
2133+ ..Default :: default ( )
2134+ } ,
2135+ memory_awaited_action_db_factory (
2136+ 0 ,
2137+ & task_change_notify. clone ( ) ,
2138+ MockInstantWrapped :: default,
2139+ ) ,
2140+ || async move { } ,
2141+ task_change_notify,
2142+ MockInstantWrapped :: default,
2143+ None ,
2144+ ) ;
2145+ let action_digest = DigestInfo :: new ( [ 99u8 ; 32 ] , 512 ) ;
2146+
2147+ let mut rx_from_worker =
2148+ setup_new_worker ( & scheduler, worker_id. clone ( ) , PlatformProperties :: default ( ) ) . await ?;
2149+ let insert_timestamp = make_system_time ( 1 ) ;
2150+ let mut action_listener =
2151+ setup_action ( & scheduler, action_digest, HashMap :: new ( ) , insert_timestamp) . await ?;
2152+
2153+ let operation_id = {
2154+ let operation_id = match rx_from_worker. recv ( ) . await . unwrap ( ) . update {
2155+ Some ( update_for_worker:: Update :: StartAction ( exec) ) => exec. operation_id ,
2156+ v => panic ! ( "Expected StartAction, got : {v:?}" ) ,
2157+ } ;
2158+ assert_eq ! (
2159+ action_listener. changed( ) . await . unwrap( ) . 0 . stage,
2160+ ActionStage :: Executing
2161+ ) ;
2162+ OperationId :: from ( operation_id. as_str ( ) )
2163+ } ;
2164+
2165+ // First disconnect: should requeue (attempts=1, not yet > max_job_retries=1).
2166+ drop (
2167+ scheduler
2168+ . update_action (
2169+ & worker_id,
2170+ & operation_id,
2171+ UpdateOperationType :: UpdateWithDisconnect ,
2172+ )
2173+ . await ,
2174+ ) ;
2175+ {
2176+ let ( action_state, _maybe_origin_metadata) = action_listener. changed ( ) . await . unwrap ( ) ;
2177+ assert_eq ! (
2178+ action_state. stage,
2179+ ActionStage :: Queued ,
2180+ "First disconnect should requeue, got: {:?}" ,
2181+ action_state. stage,
2182+ ) ;
2183+ }
2184+
2185+ // Reattach worker so it picks up the requeued action.
2186+ let mut rx_from_worker =
2187+ setup_new_worker ( & scheduler, worker_id. clone ( ) , PlatformProperties :: default ( ) ) . await ?;
2188+ {
2189+ match rx_from_worker. recv ( ) . await . unwrap ( ) . update {
2190+ Some ( update_for_worker:: Update :: StartAction ( _) ) => { /* Success */ }
2191+ v => panic ! ( "Expected StartAction, got : {v:?}" ) ,
2192+ }
2193+ assert_eq ! (
2194+ action_listener. changed( ) . await . unwrap( ) . 0 . stage,
2195+ ActionStage :: Executing
2196+ ) ;
2197+ }
2198+
2199+ // Second disconnect: now attempts=2 > max_job_retries=1, so the action
2200+ // must transition to Completed with an error mentioning the disconnect
2201+ // loop, not silently requeue.
2202+ drop (
2203+ scheduler
2204+ . update_action (
2205+ & worker_id,
2206+ & operation_id,
2207+ UpdateOperationType :: UpdateWithDisconnect ,
2208+ )
2209+ . await ,
2210+ ) ;
2211+ {
2212+ let ( action_state, _maybe_origin_metadata) = action_listener. changed ( ) . await . unwrap ( ) ;
2213+ let ActionStage :: Completed ( action_result) = & action_state. stage else {
2214+ panic ! (
2215+ "Second disconnect should mark action Completed-with-error, got: {:?}" ,
2216+ action_state. stage
2217+ ) ;
2218+ } ;
2219+ let err = action_result
2220+ . error
2221+ . as_ref ( )
2222+ . expect ( "Completed action from disconnect cap must carry an error" ) ;
2223+ assert ! (
2224+ err. to_string( )
2225+ . contains( "Worker disconnected repeatedly while executing this action" ) ,
2226+ "Error message did not mention disconnect loop: {err}" ,
2227+ ) ;
2228+ }
2229+
2230+ Ok ( ( ) )
2231+ }
2232+
2233+ /// `Action.timeout` from the RBE protocol must be enforced backend-side.
2234+ /// Without this, an action that hangs forever only terminates when the
2235+ /// Bazel client's `--remote_timeout` (gRPC deadline) or `--test_timeout`
2236+ /// (client-side) fires; from the operator's perspective the cluster never
2237+ /// surfaces the slow action.
2238+ #[ nativelink_test]
2239+ async fn action_timeout_is_enforced_backend_side_test ( ) -> Result < ( ) , Error > {
2240+ use nativelink_scheduler:: awaited_action_db:: AwaitedAction ;
2241+ use nativelink_scheduler:: simple_scheduler_state_manager:: SimpleSchedulerStateManager ;
2242+
2243+ // Anchor MockClock so MockInstantWrapped::now() == make_system_time(0).
2244+ MockClock :: set_time ( Duration :: from_secs ( NOW_TIME ) ) ;
2245+ let executing_started_at = make_system_time ( 0 ) ;
2246+
2247+ let action_digest = DigestInfo :: new ( [ 7u8 ; 32 ] , 1 ) ;
2248+ let mut action_info = make_base_action_info ( executing_started_at, action_digest) ;
2249+ Arc :: make_mut ( & mut action_info) . timeout = Duration :: from_secs ( 2 ) ;
2250+
2251+ let operation_id = OperationId :: default ( ) ;
2252+ let mut awaited_action =
2253+ AwaitedAction :: new ( operation_id. clone ( ) , action_info, executing_started_at) ;
2254+ awaited_action. worker_set_state (
2255+ Arc :: new ( ActionState {
2256+ stage : ActionStage :: Executing ,
2257+ client_operation_id : operation_id,
2258+ action_digest,
2259+ last_transition_timestamp : executing_started_at,
2260+ } ) ,
2261+ executing_started_at,
2262+ ) ;
2263+
2264+ let task_change_notify = Arc :: new ( Notify :: new ( ) ) ;
2265+ let state_mgr = SimpleSchedulerStateManager :: new (
2266+ /* max_job_retries */ 1 ,
2267+ /* no_event_action_timeout */ Duration :: from_secs ( 60 ) ,
2268+ /* client_action_timeout */ Duration :: from_secs ( 60 ) ,
2269+ /* max_executing_timeout */ Duration :: ZERO ,
2270+ memory_awaited_action_db_factory (
2271+ 0 ,
2272+ & task_change_notify. clone ( ) ,
2273+ MockInstantWrapped :: default,
2274+ ) ,
2275+ MockInstantWrapped :: default,
2276+ /* worker_registry */ None ,
2277+ ) ;
2278+
2279+ assert ! (
2280+ !state_mgr. should_timeout_operation( & awaited_action) . await ,
2281+ "Should not time out before Action.timeout elapses" ,
2282+ ) ;
2283+
2284+ // Advance past the 2s per-action deadline.
2285+ MockClock :: advance ( Duration :: from_secs ( 5 ) ) ;
2286+
2287+ assert ! (
2288+ state_mgr. should_timeout_operation( & awaited_action) . await ,
2289+ "Scheduler must mark Executing action timed out once Action.timeout has elapsed" ,
2290+ ) ;
2291+
2292+ Ok ( ( ) )
2293+ }
2294+
21192295#[ nativelink_test]
21202296async fn ensure_scheduler_drops_inner_spawn ( ) -> Result < ( ) , Error > {
21212297 struct DropChecker {
0 commit comments