Skip to content

Commit 4a6f466

Browse files
committed
Survive transient hosted node-agent registration flaps
Three layers of defence against the residual flakiness left after the reconcile-poison fix:

1. Read-side machine binding now tolerates stale registrations. resolve_machine_binding's stored-placement path used to return the binding only when resolve_known_node_binding accepted the freshness check. A new resolve_known_node_binding_for_read returns the stored binding regardless of TTL, and resolve_machine_binding now uses it as the optimistic fallback on a stale Err. If the underlying node-agent is still reachable the proxy call succeeds; if it isn't, the actual proxy failure is more informative than a synthetic stale rejection. State-changing callers (placement reconcile, machine launch) keep using the strict resolve_known_node_binding.

2. store_registered_node_refresh tolerates 60s of refreshed_at regression. The previous strict check rejected any refresh whose refreshed_at was less than the stored value, so a single NTP step-correction or modest cross-host clock skew would silently wedge a node for the rest of its TTL window every time. Now refreshes within NODE_AGENT_REGISTRATION_REFRESHED_AT_REGRESSION_TOLERANCE_SECONDS (60s) are accepted; only genuine attempts to overwrite the stored high-water mark with much-older state are rejected. The node_agent_surfaces_explicit_registration_failures test now uses a 600s future high-water mark to keep exercising the wildly-stale path.

3. NODE_AGENT_REGISTRATION_TTL_SECONDS raised from 45s to 120s. Combined with the unchanged 5s refresh interval, a node now tolerates ~24 missed refreshes (was ~9) before going stale. The previous 9-cycle ceiling left no headroom for normal cross-AZ network jitter; brief HTTP slowdowns repeatedly pushed aws-linux-node-2 into the stale window during ordinary load.

Observed on prod 2026-05-06 as intermittent inspector reports of 'k3s-agent on cloud-aws-worker-2 as unreachable' while the K3s cluster itself was healthy and serving pods.
1 parent 490edb3 commit 4a6f466

1 file changed

Lines changed: 114 additions & 22 deletions

File tree

crates/port-runtime/src/hosted_control_plane.rs

Lines changed: 114 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -424,8 +424,13 @@ const WEDGE_DETECTOR_INTERVAL: Duration = Duration::from_secs(10);
424424
#[cfg(test)]
425425
const NODE_AGENT_REGISTRATION_TTL_SECONDS: u64 = 3;
426426
#[cfg(not(test))]
427-
const NODE_AGENT_REGISTRATION_TTL_SECONDS: u64 = 45;
427+
const NODE_AGENT_REGISTRATION_TTL_SECONDS: u64 = 120;
428428
const NODE_AGENT_REGISTRATION_HTTP_TIMEOUT: Duration = Duration::from_secs(15);
429+
// Tolerate registration `refreshed_at` regressing by up to this many seconds
430+
// against the previously-stored value. NTP step-corrections and modest
431+
// cross-host clock skew otherwise wedge a node for the remainder of its
432+
// freshness window every time the wall clock moves backward.
433+
const NODE_AGENT_REGISTRATION_REFRESHED_AT_REGRESSION_TOLERANCE_SECONDS: u64 = 60;
429434

430435
// Hosted K3s service apply can legitimately run for several minutes while a
431436
// freshly relaunched control plane settles.
@@ -4189,13 +4194,20 @@ fn store_registered_node_refresh(
41894194
)
41904195
})?
41914196
.clone();
4192-
if let Some(existing) = current_state.nodes.get(node_name)
4193-
&& registration.refreshed_at < existing.refreshed_at
4194-
{
4195-
return Err(format!(
4196-
"control plane '{}' rejected stale registration refresh for node '{}': refreshed_at {} is older than current {}",
4197-
state.inner.control_plane, node_name, registration.refreshed_at, existing.refreshed_at
4198-
));
4197+
if let Some(existing) = current_state.nodes.get(node_name) {
4198+
let regression = existing
4199+
.refreshed_at
4200+
.saturating_sub(registration.refreshed_at);
4201+
if regression > NODE_AGENT_REGISTRATION_REFRESHED_AT_REGRESSION_TOLERANCE_SECONDS {
4202+
return Err(format!(
4203+
"control plane '{}' rejected stale registration refresh for node '{}': refreshed_at {} is older than current {} by {regression}s (> {}s tolerance)",
4204+
state.inner.control_plane,
4205+
node_name,
4206+
registration.refreshed_at,
4207+
existing.refreshed_at,
4208+
NODE_AGENT_REGISTRATION_REFRESHED_AT_REGRESSION_TOLERANCE_SECONDS
4209+
));
4210+
}
41994211
}
42004212

42014213
let mut next_state = current_state;
@@ -4829,6 +4841,52 @@ fn resolve_known_node_binding(
48294841
Ok(None)
48304842
}
48314843

4844+
/// Read-side variant of [`resolve_known_node_binding`] that returns the
4845+
/// stored binding regardless of registration freshness.
4846+
///
4847+
/// Inspector / status / service-list callers should prefer this so a brief
4848+
/// flap in the freshness window doesn't 502 every machine-keyed query for
4849+
/// the affected node — if the underlying node-agent is still reachable the
4850+
/// proxy call succeeds, and if it isn't, the actual proxy failure is more
4851+
/// informative than a synthetic stale rejection. State-changing callers
4852+
/// (placement reconcile, machine launch) should keep using the strict
4853+
/// freshness check via `resolve_known_node_binding`.
4854+
fn resolve_known_node_binding_for_read(
4855+
state: &ControlPlaneState,
4856+
node_name: &str,
4857+
) -> Option<(HostedNodeBinding, PathBuf)> {
4858+
if let Some(record) = state
4859+
.inner
4860+
.registered_nodes
4861+
.read()
4862+
.ok()?
4863+
.get(node_name)
4864+
.cloned()
4865+
{
4866+
return Some((record.binding, record.contract.node.runtime_root.clone()));
4867+
}
4868+
state
4869+
.inner
4870+
.static_node_bindings
4871+
.get(node_name)
4872+
.cloned()
4873+
.map(|binding| {
4874+
let runtime_root = state
4875+
.inner
4876+
.config
4877+
.nodes
4878+
.get(node_name)
4879+
.map(|node| node.runtime_root.clone())
4880+
.unwrap_or_else(|| {
4881+
hosted_placeholder_runtime_root_for_config(
4882+
&state.inner.config,
4883+
&state.inner.control_plane,
4884+
)
4885+
});
4886+
(binding, runtime_root)
4887+
})
4888+
}
4889+
48324890
fn resolve_machine_binding(
48334891
state: &ControlPlaneState,
48344892
summary: &HostedMachineSummaryContract,
@@ -4847,24 +4905,52 @@ fn resolve_machine_binding(
48474905
let stored_route = stored_machine_route_context(summary, &placement);
48484906
match resolve_known_node_binding(state, &placement.node_name) {
48494907
Ok(Some((binding, _))) => return Ok((Some(binding), stored_route, None)),
4850-
Ok(None) | Err(_) => {
4851-
let stored_issue = match resolve_known_node_binding(state, &placement.node_name) {
4852-
Ok(None) => machine_placement_detail(
4853-
&stored_route,
4854-
format!(
4855-
"stored placement points at node '{}' but the control plane has no live registered node-agent endpoint for it.",
4856-
placement.node_name
4857-
),
4858-
),
4859-
Err(message) => machine_placement_detail(
4908+
Err(stale_message) => {
4909+
// The freshness check rejected the binding, but a stored
4910+
// placement record still points at this node. Hand back
4911+
// the stored binding optimistically — if the node-agent
4912+
// is reachable the proxy call succeeds, and if it isn't
4913+
// the proxy failure is more informative than a synthetic
4914+
// stale rejection. Brief TTL flaps shouldn't 502 every
4915+
// read for that machine.
4916+
if let Some((binding, _)) =
4917+
resolve_known_node_binding_for_read(state, &placement.node_name)
4918+
{
4919+
let warning = machine_placement_detail(
48604920
&stored_route,
48614921
format!(
4862-
"stored placement on node '{}' is not currently usable: {message}",
4922+
"stored placement on node '{}' is currently flagged stale; using stored binding optimistically: {stale_message}",
48634923
placement.node_name
48644924
),
4925+
);
4926+
return Ok((Some(binding), stored_route, Some(warning)));
4927+
}
4928+
let stored_issue = machine_placement_detail(
4929+
&stored_route,
4930+
format!(
4931+
"stored placement on node '{}' is not currently usable: {stale_message}",
4932+
placement.node_name
48654933
),
4866-
Ok(Some(_)) => String::new(),
4934+
);
4935+
return match resolve_node_binding(state, summary) {
4936+
Ok((binding, route)) => Ok((Some(binding), route, None)),
4937+
Err((route, message)) => Ok((
4938+
None,
4939+
route,
4940+
Some(format!(
4941+
"{stored_issue} Fallback candidate routing also failed: {message}"
4942+
)),
4943+
)),
48674944
};
4945+
}
4946+
Ok(None) => {
4947+
let stored_issue = machine_placement_detail(
4948+
&stored_route,
4949+
format!(
4950+
"stored placement points at node '{}' but the control plane has no live registered node-agent endpoint for it.",
4951+
placement.node_name
4952+
),
4953+
);
48684954
return match resolve_node_binding(state, summary) {
48694955
Ok((binding, route)) => Ok((Some(binding), route, None)),
48704956
Err((route, message)) => Ok((
@@ -12110,8 +12196,14 @@ mod tests {
1211012196
HostedNodeRegistration {
1211112197
endpoint: String::from("http://127.0.0.1:9234"),
1211212198
token: String::from("node-secret"),
12113-
registered_at: now + 60,
12114-
refreshed_at: now + 60,
12199+
// Pre-existing refreshed_at sits beyond the
12200+
// NODE_AGENT_REGISTRATION_REFRESHED_AT_REGRESSION_TOLERANCE_SECONDS
12201+
// window so the node-agent's `now`-stamped
12202+
// refresh is rejected as wildly stale rather
12203+
// than silently overwriting a far-future
12204+
// high-water mark.
12205+
registered_at: now + 600,
12206+
refreshed_at: now + 600,
1211512207
ttl_seconds: 30,
1211612208
},
1211712209
)]),

0 commit comments

Comments
 (0)