Skip to content

Commit f04e3a1

Browse files
fix: adjust lease renewal error handling
Change the lease renewal error handling to only retry on network errors again. Running experiments with #279 did show that all failed leases were due to before hidden error cases. With this change we'll drop workers without an active lease (i.e. due to a failed task) faster and reduce noise in our logs.
1 parent 9d7f55d commit f04e3a1

File tree

1 file changed

+38
-40
lines changed

1 file changed

+38
-40
lines changed

crates/scheduler/src/worker.rs

Lines changed: 38 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -99,61 +99,29 @@ impl Worker {
9999
} else {
100100
Duration::from_secs(1)
101101
};
102-
// NOTE: We want to retry the lease renewal until the remaining time has elapsed
102+
// NOTE: We retry network errors until the remaining time has elapsed.
103103
let retry_strategy = FixedInterval::from_millis(200)
104104
.map(jitter)
105105
.take(remaining_time.as_millis() as usize / 200);
106106

107107
let result = Retry::spawn(retry_strategy, || {
108108
let network = network.clone();
109109
async move {
110-
match network
110+
network
111111
.request::<api::Codec>(
112112
peer_id,
113113
api::Request::RenewLease(renew_lease::Request { id: lease_id }),
114114
)
115115
.await
116-
{
117-
Ok(api::Response::RenewLease(renew_lease::Response::Renewed {
118-
timeout,
119-
..
120-
})) => Ok(timeout),
121-
Ok(api::Response::RenewLease(renew_lease::Response::NotFound)) => {
122-
tracing::error!(%lease_id, %peer_id,
123-
"Lease renewal failed with lease not found");
124-
Err(api::Response::RenewLease(renew_lease::Response::NotFound))
125-
}
126-
Ok(api::Response::RenewLease(renew_lease::Response::Failed)) => {
127-
tracing::error!(%lease_id, %peer_id,
128-
"Lease renewal failed");
129-
130-
Err(api::Response::RenewLease(renew_lease::Response::NotFound))
131-
}
132-
Ok(api::Response::RenewLease(renew_lease::Response::Forbidden)) => {
133-
tracing::error!(%lease_id, %peer_id,
134-
"Lease renewal forbidden");
135-
136-
Err(api::Response::RenewLease(renew_lease::Response::NotFound))
137-
}
138-
Err(error) => {
139-
tracing::error!(%lease_id, %peer_id, error=%error,
140-
"Lease renewal request failed");
141-
142-
Err(api::Response::RenewLease(renew_lease::Response::Failed))
143-
}
144-
_ => {
145-
tracing::error!(%lease_id, %peer_id,
146-
"Lease renewal with unexpected response");
147-
148-
Err(api::Response::RenewLease(renew_lease::Response::Failed))
149-
}
150-
}
151116
}
152117
})
153118
.await;
154119

155120
match result {
156-
Ok(timeout) => {
121+
Ok(api::Response::RenewLease(renew_lease::Response::Renewed {
122+
timeout,
123+
..
124+
})) => {
157125
last_timeout = Some(timeout);
158126
let duration = timeout
159127
.duration_since(SystemTime::now())
@@ -174,14 +142,44 @@ impl Worker {
174142

175143
sleep(safe_duration).await;
176144
}
145+
Ok(api::Response::RenewLease(renew_lease::Response::NotFound)) => {
146+
tracing::error!(
147+
%lease_id,
148+
%peer_id,
149+
"Lease renewal failed: lease not found"
150+
);
151+
152+
return Err(WorkerError::LeaseExpired);
153+
}
154+
Ok(api::Response::RenewLease(renew_lease::Response::Failed)) => {
155+
tracing::error!(%lease_id, %peer_id, "Lease renewal failed");
156+
157+
return Err(WorkerError::LeaseExpired);
158+
}
159+
Ok(api::Response::RenewLease(renew_lease::Response::Forbidden)) => {
160+
tracing::error!(%lease_id, %peer_id, "Lease renewal forbidden");
161+
162+
return Err(WorkerError::LeaseExpired);
163+
}
164+
Ok(response) => {
165+
tracing::error!(
166+
%lease_id,
167+
%peer_id,
168+
response = ?response,
169+
"Lease renewal returned unexpected response"
170+
);
171+
172+
return Err(WorkerError::LeaseExpired);
173+
}
177174
Err(error) => {
178175
tracing::warn!(
179176
%lease_id,
180177
%peer_id,
181-
error = ?error,
178+
error = %error,
182179
"Lease renewal failed after retries"
183180
);
184-
return Err(WorkerError::LeaseExpired);
181+
182+
return Err(WorkerError::NetworkError(error));
185183
}
186184
}
187185
}

0 commit comments

Comments
 (0)