@@ -132,6 +132,11 @@ impl HeartbeatRunner {
132132 first_at,
133133 ) ;
134134
135+ // Exponential backoff for SkippedMayTry retries
136+ let mut skips_since_last = 0 ;
137+ let skip_retry_base = Duration :: from_millis ( 1000 ) ;
138+ let skip_retry_max = self . interval / 2 ;
139+
135140 loop {
136141 interval. tick ( ) . await ; // Sleep until next interval
137142
@@ -170,6 +175,16 @@ impl HeartbeatRunner {
170175 warn ! ( name: "Heartbeat" , "response not OK: {}" , response) ;
171176 }
172177
178+ if status == HeartbeatStatus :: SkippedMayTry {
179+ skips_since_last += 1 ;
180+ let retry_after =
181+ ( skip_retry_base * 2_u32 . pow ( skips_since_last) ) . min ( skip_retry_max) ;
182+ interval. reset_after ( retry_after) ;
183+ info ! ( name: "Heartbeat" , "transient skip, retry quickly after: {:?}" , retry_after) ;
184+ } else {
185+ skips_since_last = 0 ;
186+ }
187+
173188 HeartbeatEvent {
174189 ts : now_ms ( ) ,
175190 status : status. clone ( ) ,
@@ -190,12 +205,14 @@ impl HeartbeatRunner {
190205 }
191206 } ;
192207
193- // Persist last heartbeat event to disk
194- if let Err ( e) = serde_json:: to_writer_pretty (
195- fs:: File :: create ( self . config . paths . last_heartbeat ( ) ) ?,
196- & event,
197- ) {
198- warn ! ( name: "Heartbeat" , "failed to write event: {}" , e) ;
208+ // Persist any non-transient heartbeat event to disk
209+ if event. status != HeartbeatStatus :: SkippedMayTry {
210+ if let Err ( e) = serde_json:: to_writer_pretty (
211+ fs:: File :: create ( self . config . paths . last_heartbeat ( ) ) ?,
212+ & event,
213+ ) {
214+ warn ! ( name: "Heartbeat" , "failed to write event: {}" , e) ;
215+ }
199216 }
200217
201218 emit_heartbeat_event ( event) ;
@@ -248,15 +265,21 @@ impl HeartbeatRunner {
248265 && gate. is_busy ( )
249266 {
250267 info ! ( name: "Heartbeat" , "skipping: agent turn in flight (TurnGate busy)" ) ;
251- return Ok ( ( HEARTBEAT_OK_TOKEN . to_string ( ) , HeartbeatStatus :: Skipped ) ) ;
268+ return Ok ( (
269+ HEARTBEAT_OK_TOKEN . to_string ( ) ,
270+ HeartbeatStatus :: SkippedMayTry ,
271+ ) ) ;
252272 }
253273
254274 // Try to acquire the cross-process workspace lock (non-blocking)
255275 let _ws_guard = match self . workspace_lock . try_acquire ( ) ? {
256276 Some ( guard) => guard,
257277 None => {
258278 info ! ( name: "Heartbeat" , "skipping: workspace locked by another process" ) ;
259- return Ok ( ( HEARTBEAT_OK_TOKEN . to_string ( ) , HeartbeatStatus :: Skipped ) ) ;
279+ return Ok ( (
280+ HEARTBEAT_OK_TOKEN . to_string ( ) ,
281+ HeartbeatStatus :: SkippedMayTry ,
282+ ) ) ;
260283 }
261284 } ;
262285
@@ -267,7 +290,10 @@ impl HeartbeatRunner {
267290 Some ( permit) => Some ( permit) ,
268291 None => {
269292 info ! ( name: "Heartbeat" , "skipping: agent turn started between check and acquire" ) ;
270- return Ok ( ( HEARTBEAT_OK_TOKEN . to_string ( ) , HeartbeatStatus :: Skipped ) ) ;
293+ return Ok ( (
294+ HEARTBEAT_OK_TOKEN . to_string ( ) ,
295+ HeartbeatStatus :: SkippedMayTry ,
296+ ) ) ;
271297 }
272298 }
273299 } else {
0 commit comments