Skip to content

Commit 33abff8

Browse files
committed
Quicken transiently skipped heartbeats
1 parent 6ed7874 commit 33abff8

3 files changed

Lines changed: 30 additions & 3 deletions

File tree

crates/core/src/heartbeat/events.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@ pub enum HeartbeatStatus {
1313
Ok,
1414
/// Heartbeat was skipped (outside active hours, empty file, etc.)
1515
Skipped,
16+
/// Heartbeat was skipped transiently (a soon retry may be useful)
17+
SkippedMayTry,
1618
/// Heartbeat failed with an error
1719
Failed,
1820
}

crates/core/src/heartbeat/runner.rs

Lines changed: 27 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,11 @@ impl HeartbeatRunner {
132132
first_at,
133133
);
134134

135+
// Exponential backoff for SkippedMayTry retries
136+
let mut skips_since_last = 0;
137+
let skip_retry_base = Duration::from_millis(1000);
138+
let skip_retry_max = self.interval / 2;
139+
135140
loop {
136141
interval.tick().await; // Sleep until next interval
137142

@@ -170,6 +175,16 @@ impl HeartbeatRunner {
170175
warn!(name: "Heartbeat", "response not OK: {}", response);
171176
}
172177

178+
if status == HeartbeatStatus::SkippedMayTry {
179+
skips_since_last += 1;
180+
let retry_after =
181+
(skip_retry_base * 2_u32.pow(skips_since_last)).min(skip_retry_max);
182+
interval.reset_after(retry_after);
183+
info!(name: "Heartbeat", "transient skip, retry quickly after: {:?}", retry_after);
184+
} else {
185+
skips_since_last = 0;
186+
}
187+
173188
HeartbeatEvent {
174189
ts: now_ms(),
175190
status: status.clone(),
@@ -248,15 +263,21 @@ impl HeartbeatRunner {
248263
&& gate.is_busy()
249264
{
250265
info!(name: "Heartbeat", "skipping: agent turn in flight (TurnGate busy)");
251-
return Ok((HEARTBEAT_OK_TOKEN.to_string(), HeartbeatStatus::Skipped));
266+
return Ok((
267+
HEARTBEAT_OK_TOKEN.to_string(),
268+
HeartbeatStatus::SkippedMayTry,
269+
));
252270
}
253271

254272
// Try to acquire the cross-process workspace lock (non-blocking)
255273
let _ws_guard = match self.workspace_lock.try_acquire()? {
256274
Some(guard) => guard,
257275
None => {
258276
info!(name: "Heartbeat", "skipping: workspace locked by another process");
259-
return Ok((HEARTBEAT_OK_TOKEN.to_string(), HeartbeatStatus::Skipped));
277+
return Ok((
278+
HEARTBEAT_OK_TOKEN.to_string(),
279+
HeartbeatStatus::SkippedMayTry,
280+
));
260281
}
261282
};
262283

@@ -267,7 +288,10 @@ impl HeartbeatRunner {
267288
Some(permit) => Some(permit),
268289
None => {
269290
info!(name: "Heartbeat", "skipping: agent turn started between check and acquire");
270-
return Ok((HEARTBEAT_OK_TOKEN.to_string(), HeartbeatStatus::Skipped));
291+
return Ok((
292+
HEARTBEAT_OK_TOKEN.to_string(),
293+
HeartbeatStatus::SkippedMayTry,
294+
));
271295
}
272296
}
273297
} else {

crates/server/src/http.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1064,6 +1064,7 @@ async fn heartbeat_status(State(state): State<Arc<AppState>>) -> Json<HeartbeatS
10641064
HeartbeatStatus::Sent => "sent",
10651065
HeartbeatStatus::Ok => "ok",
10661066
HeartbeatStatus::Skipped => "skipped",
1067+
HeartbeatStatus::SkippedMayTry => "skipped",
10671068
HeartbeatStatus::Failed => "failed",
10681069
};
10691070

0 commit comments

Comments
 (0)