Skip to content

Commit a4da302

Browse files
committed
Quicken transiently skipped heartbeats
1 parent 6ed7874 commit a4da302

3 files changed

Lines changed: 38 additions & 9 deletions

File tree

crates/core/src/heartbeat/events.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@ pub enum HeartbeatStatus {
1313
Ok,
1414
/// Heartbeat was skipped (outside active hours, empty file, etc.)
1515
Skipped,
16+
/// Heartbeat was skipped transiently (a soon retry may be useful)
17+
SkippedMayTry,
1618
/// Heartbeat failed with an error
1719
Failed,
1820
}

crates/core/src/heartbeat/runner.rs

Lines changed: 35 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,11 @@ impl HeartbeatRunner {
132132
first_at,
133133
);
134134

135+
// Exponential backoff for SkippedMayTry retries
136+
let mut skips_since_last = 0;
137+
let skip_retry_base = Duration::from_millis(1000);
138+
let skip_retry_max = self.interval / 2;
139+
135140
loop {
136141
interval.tick().await; // Sleep until next interval
137142

@@ -170,6 +175,16 @@ impl HeartbeatRunner {
170175
warn!(name: "Heartbeat", "response not OK: {}", response);
171176
}
172177

178+
if status == HeartbeatStatus::SkippedMayTry {
179+
skips_since_last += 1;
180+
let retry_after =
181+
(skip_retry_base * 2_u32.pow(skips_since_last)).min(skip_retry_max);
182+
interval.reset_after(retry_after);
183+
info!(name: "Heartbeat", "transient skip, retry quickly after: {:?}", retry_after);
184+
} else {
185+
skips_since_last = 0;
186+
}
187+
173188
HeartbeatEvent {
174189
ts: now_ms(),
175190
status: status.clone(),
@@ -190,12 +205,14 @@ impl HeartbeatRunner {
190205
}
191206
};
192207

193-
// Persist last heartbeat event to disk
194-
if let Err(e) = serde_json::to_writer_pretty(
195-
fs::File::create(self.config.paths.last_heartbeat())?,
196-
&event,
197-
) {
198-
warn!(name: "Heartbeat", "failed to write event: {}", e);
208+
// Persist any non-transient heartbeat event to disk
209+
if event.status != HeartbeatStatus::SkippedMayTry {
210+
if let Err(e) = serde_json::to_writer_pretty(
211+
fs::File::create(self.config.paths.last_heartbeat())?,
212+
&event,
213+
) {
214+
warn!(name: "Heartbeat", "failed to write event: {}", e);
215+
}
199216
}
200217

201218
emit_heartbeat_event(event);
@@ -248,15 +265,21 @@ impl HeartbeatRunner {
248265
&& gate.is_busy()
249266
{
250267
info!(name: "Heartbeat", "skipping: agent turn in flight (TurnGate busy)");
251-
return Ok((HEARTBEAT_OK_TOKEN.to_string(), HeartbeatStatus::Skipped));
268+
return Ok((
269+
HEARTBEAT_OK_TOKEN.to_string(),
270+
HeartbeatStatus::SkippedMayTry,
271+
));
252272
}
253273

254274
// Try to acquire the cross-process workspace lock (non-blocking)
255275
let _ws_guard = match self.workspace_lock.try_acquire()? {
256276
Some(guard) => guard,
257277
None => {
258278
info!(name: "Heartbeat", "skipping: workspace locked by another process");
259-
return Ok((HEARTBEAT_OK_TOKEN.to_string(), HeartbeatStatus::Skipped));
279+
return Ok((
280+
HEARTBEAT_OK_TOKEN.to_string(),
281+
HeartbeatStatus::SkippedMayTry,
282+
));
260283
}
261284
};
262285

@@ -267,7 +290,10 @@ impl HeartbeatRunner {
267290
Some(permit) => Some(permit),
268291
None => {
269292
info!(name: "Heartbeat", "skipping: agent turn started between check and acquire");
270-
return Ok((HEARTBEAT_OK_TOKEN.to_string(), HeartbeatStatus::Skipped));
293+
return Ok((
294+
HEARTBEAT_OK_TOKEN.to_string(),
295+
HeartbeatStatus::SkippedMayTry,
296+
));
271297
}
272298
}
273299
} else {

crates/server/src/http.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1064,6 +1064,7 @@ async fn heartbeat_status(State(state): State<Arc<AppState>>) -> Json<HeartbeatS
10641064
HeartbeatStatus::Sent => "sent",
10651065
HeartbeatStatus::Ok => "ok",
10661066
HeartbeatStatus::Skipped => "skipped",
1067+
HeartbeatStatus::SkippedMayTry => "skipped",
10671068
HeartbeatStatus::Failed => "failed",
10681069
};
10691070

0 commit comments

Comments
 (0)