-
Notifications
You must be signed in to change notification settings - Fork 9
fix(publisher): filter ACK quorum candidates by <contextGraphsServed>
#556
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 3 commits
2bff487
1c4c371
ce7bb42
c127553
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -12,6 +12,27 @@ export interface ACKCollectorDeps { | |
| gossipPublish: (topic: string, data: Uint8Array) => Promise<void>; | ||
| sendP2P: (peerId: string, protocol: string, data: Uint8Array) => Promise<Uint8Array>; | ||
| getConnectedCorePeers: () => string[]; | ||
| /** | ||
| * Optional hosting filter. Given the target context-graph UAL, return | ||
| * the subset of currently-connected core peers whose published agent | ||
| * profile (`<https://dkg.origintrail.io/skill#contextGraphsServed>`) | ||
| * advertises hosting it. | ||
| * | ||
| * When provided, the collector prefers this filtered candidate set — | ||
| * cores that don't host the CG would otherwise have to reject the | ||
| * StorageACK request mid-stream (most often as `No data found in SWM | ||
| * graph ...`), which the publisher sees as a libp2p stream reset and | ||
| * has to retry/timeout against. Filtering up front avoids that cost. | ||
| * | ||
| * Behaviour when fewer than `requiredACKs` peers match: the collector | ||
| * logs a warning naming the CG + which connected cores are vs. aren't | ||
| * advertising it, and **falls back to the full connected-core set** so | ||
| * publishes still proceed when the hosting registry is incomplete or | ||
| * stale. This deliberately keeps the path live during discovery races, | ||
| * but makes hosting-coverage bugs (see GitHub issue #541) visible in | ||
| * the log instead of presenting as opaque ACK timeouts. | ||
| */ | ||
| getCorePeersHostingContextGraph?: (cgIdStr: string) => string[] | Promise<string[]>; | ||
| verifyIdentity?: (recoveredAddress: string, claimedIdentityId: bigint) => Promise<boolean>; | ||
| log?: (msg: string) => void; | ||
| } | ||
|
|
@@ -32,6 +53,17 @@ export interface ACKCollectionResult { | |
| const DEFAULT_REQUIRED_ACKS = 3; | ||
| const ACK_TIMEOUT_MS = 120_000; | ||
| const MAX_RETRIES = 3; | ||
| /** | ||
| * Hard ceiling for the optional `getCorePeersHostingContextGraph` | ||
| * lookup. The lookup runs against the local triple store BEFORE the | ||
| * `ACK_TIMEOUT_MS` budget begins, so an unbounded await here can block | ||
| * a publish indefinitely if the store is under load or the query | ||
| * implementation hangs (Codex Review on PR#556). On timeout the | ||
| * collector treats the lookup as "no hosting signal" — falling back to | ||
| * the legacy single-wave behaviour against all connected cores — | ||
| * rather than escalating into a publish failure. | ||
| */ | ||
| const HOSTING_FILTER_TIMEOUT_MS = 1_500; | ||
|
|
||
| /** | ||
| * ACKCollector implements V10 spec §9.0 Phase 3: collecting 3 core node | ||
|
|
@@ -122,16 +154,95 @@ export class ACKCollector { | |
| // that decode payloads as FinalizationMessages, causing decode errors. | ||
| log(`[ACKCollector] Collecting ACKs via direct P2P (merkleRoot=${ethers.hexlify(merkleRoot).slice(0, 18)}...)`); | ||
|
|
||
| const corePeers = this.deps.getConnectedCorePeers(); | ||
| if (corePeers.length === 0) { | ||
| const allConnected = this.deps.getConnectedCorePeers(); | ||
| if (allConnected.length === 0) { | ||
| throw new Error('ACK collection failed: no connected core peers'); | ||
| } | ||
| if (corePeers.length < REQUIRED_ACKS) { | ||
|
|
||
| // Split connected cores into two waves: | ||
| // priorityPeers — advertise hosting the target CG; tried first. | ||
| // fallbackPeers — connected but don't advertise; only tried if | ||
| // the priority wave can't satisfy quorum. | ||
| // | ||
| // Cores outside the priority set are NOT a hard gate (a stale or | ||
| // missing advertisement on one peer in `priorityPeers` shouldn't | ||
| // be able to fail a publish that the rest of the connected pool | ||
| // could have satisfied — Codex Review on PR#556 flagged this). | ||
| // Wave 2 only fires when wave 1 doesn't reach quorum, so the happy | ||
| // path still avoids dialling cores that would just decline / reset | ||
| // the stream (the GitHub #541 cost). | ||
| let priorityPeers: string[] = allConnected; | ||
| let fallbackPeers: string[] = []; | ||
| if (this.deps.getCorePeersHostingContextGraph) { | ||
| let hostingPeers: string[] = []; | ||
| try { | ||
| const lookupPromise = Promise.resolve( | ||
| this.deps.getCorePeersHostingContextGraph(contextGraphIdStr), | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🟡 Issue: The hosting lookup is keyed by |
||
| ); | ||
| // Bound the local-store lookup so a slow / hung registry query | ||
| // can't block the publish before the ACK_TIMEOUT_MS budget even | ||
| // begins (Codex Review on PR#556). On timeout we treat the | ||
| // result as "no hosting signal" — i.e. fall back to the legacy | ||
| // single-wave path against all connected cores. | ||
| let timeoutHandle: ReturnType<typeof setTimeout> | undefined; | ||
| const timeoutSentinel: unique symbol = Symbol('hosting-filter-timeout'); | ||
| const timeoutPromise = new Promise<typeof timeoutSentinel>(resolve => { | ||
| timeoutHandle = setTimeout(() => resolve(timeoutSentinel), HOSTING_FILTER_TIMEOUT_MS); | ||
| }); | ||
| const settled = await Promise.race([lookupPromise, timeoutPromise]); | ||
| if (timeoutHandle) clearTimeout(timeoutHandle); | ||
| if (settled === timeoutSentinel) { | ||
| log( | ||
| `[ACKCollector] hosting-filter lookup did not return within ${HOSTING_FILTER_TIMEOUT_MS}ms for "${contextGraphIdStr}"; ` + | ||
| `dialling all ${allConnected.length} connected cores in a single wave`, | ||
| ); | ||
| hostingPeers = []; | ||
| } else { | ||
| hostingPeers = settled; | ||
| } | ||
| } catch (err) { | ||
| const errMsg = err instanceof Error ? err.message : String(err); | ||
| log( | ||
| `[ACKCollector] hosting-filter lookup failed for "${contextGraphIdStr}" (${errMsg}); ` + | ||
| `dialling all ${allConnected.length} connected cores in a single wave`, | ||
| ); | ||
| hostingPeers = []; | ||
| } | ||
| const hostingSet = new Set(hostingPeers); | ||
| const matched = allConnected.filter(p => hostingSet.has(p)); | ||
| const excluded = allConnected.filter(p => !hostingSet.has(p)); | ||
|
|
||
| if (matched.length === 0) { | ||
| // No hosting signal at all — keep the legacy single-wave shape. | ||
| log( | ||
| `[ACKCollector] hosting filter: no connected cores advertise hosting "${contextGraphIdStr}"; ` + | ||
| `dialling all ${allConnected.length} connected cores in a single wave (expect declines from non-hosting cores).`, | ||
| ); | ||
| } else { | ||
| priorityPeers = matched; | ||
| fallbackPeers = excluded; | ||
| const includedTag = matched.map(p => p.slice(-8)).join(', '); | ||
| const excludedTag = excluded.length > 0 ? excluded.map(p => p.slice(-8)).join(', ') : '<none>'; | ||
| log( | ||
| `[ACKCollector] hosting filter: priority wave = ${matched.length}/${allConnected.length} cores advertising "${contextGraphIdStr}" [${includedTag}]; ` + | ||
| `fallback wave = ${excluded.length} non-advertising cores [${excludedTag}].`, | ||
| ); | ||
| if (matched.length < REQUIRED_ACKS) { | ||
| log( | ||
| `[ACKCollector] WARN: only ${matched.length} connected cores advertise hosting "${contextGraphIdStr}" (need ${REQUIRED_ACKS}); ` + | ||
| `fallback wave will be dialled if priority wave can't satisfy quorum. ` + | ||
| `If "${contextGraphIdStr}" has replicationPolicy=full, the non-advertising cores have a coverage bug (see GitHub issue #541).`, | ||
| ); | ||
| } | ||
| } | ||
| } | ||
|
|
||
| if (allConnected.length < REQUIRED_ACKS) { | ||
| throw new Error( | ||
| `ACK collection failed: need ${REQUIRED_ACKS} ACKs but only ${corePeers.length} core peers connected — quorum impossible`, | ||
| `ACK collection failed: need ${REQUIRED_ACKS} ACKs but only ${allConnected.length} core peers connected — quorum impossible`, | ||
| ); | ||
| } | ||
| log(`[ACKCollector] Requesting ACKs from ${corePeers.length} core peers (need ${REQUIRED_ACKS})`); | ||
| log(`[ACKCollector] Requesting ACKs from ${allConnected.length} core peers (need ${REQUIRED_ACKS}; priority=${priorityPeers.length}, fallback=${fallbackPeers.length})`); | ||
|
|
||
| const ackDigest = computePublishACKDigest( | ||
| chainId, | ||
|
|
@@ -202,22 +313,41 @@ export class ACKCollector { | |
| let quorumResolve: (() => void) | undefined; | ||
| const quorumPromise = new Promise<void>(resolve => { quorumResolve = resolve; }); | ||
|
|
||
| /** | ||
| * Dial a wave of peers in parallel, accumulating their ACKs into | ||
| * the shared `collected` slot. Stops early when the publisher has | ||
| * `REQUIRED_ACKS` distinct (peer, identity) ACKs. | ||
| */ | ||
| const dialWave = async (peers: string[]): Promise<void> => { | ||
| if (peers.length === 0) return; | ||
| const promises = peers.map(async (peerId) => { | ||
| if (collected.length >= REQUIRED_ACKS) return; | ||
| const ack = await requestACK(peerId); | ||
| if (ack && !seenPeers.has(ack.peerId) && !seenIdentityIds.has(ack.nodeIdentityId)) { | ||
| seenPeers.add(ack.peerId); | ||
| seenIdentityIds.add(ack.nodeIdentityId); | ||
| collected.push(ack); | ||
| if (collected.length >= REQUIRED_ACKS) { | ||
| quorumResolve?.(); | ||
| } | ||
| } | ||
| }); | ||
| await Promise.race([Promise.allSettled(promises), quorumPromise]); | ||
| }; | ||
|
|
||
| let triedPeerCount = 0; | ||
| await Promise.race([ | ||
| (async () => { | ||
| const promises = corePeers.map(async (peerId) => { | ||
| if (collected.length >= REQUIRED_ACKS) return; | ||
| const ack = await requestACK(peerId); | ||
| if (ack && !seenPeers.has(ack.peerId) && !seenIdentityIds.has(ack.nodeIdentityId)) { | ||
| seenPeers.add(ack.peerId); | ||
| seenIdentityIds.add(ack.nodeIdentityId); | ||
| collected.push(ack); | ||
| if (collected.length >= REQUIRED_ACKS) { | ||
| quorumResolve?.(); | ||
| return; | ||
| } | ||
| } | ||
| }); | ||
| await Promise.race([Promise.allSettled(promises), quorumPromise]); | ||
| await dialWave(priorityPeers); | ||
| triedPeerCount = priorityPeers.length; | ||
| if (collected.length < REQUIRED_ACKS && fallbackPeers.length > 0) { | ||
| log( | ||
| `[ACKCollector] Priority wave settled with ${collected.length}/${REQUIRED_ACKS} ACKs; ` + | ||
| `dialling fallback wave (${fallbackPeers.length} non-advertising core(s))`, | ||
| ); | ||
| await dialWave(fallbackPeers); | ||
| triedPeerCount += fallbackPeers.length; | ||
| } | ||
| })(), | ||
| new Promise<never>((_, reject) => | ||
| setTimeout(() => reject(new Error(`storage_ack_timeout: only ${collected.length}/${REQUIRED_ACKS} ACKs received within ${ACK_TIMEOUT_MS}ms`)), | ||
|
|
@@ -229,7 +359,7 @@ export class ACKCollector { | |
| if (collected.length < REQUIRED_ACKS) { | ||
| throw new Error( | ||
| `storage_ack_insufficient: got ${collected.length}/${REQUIRED_ACKS} valid ACKs. ` + | ||
| `Tried ${corePeers.length} core peers.`, | ||
| `Tried ${triedPeerCount} core peers.`, | ||
| ); | ||
| } | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
🟡 Issue: Catching and returning
[]here hides real resolver/store failures fromACKCollector, so production regressions become indistinguishable from a legitimate 'no hosting signal' fallback. The collector already handles failures safely and logs the cause; either let the error propagate to it, or log the exception before returning[]here (same issue inpackages/cli/src/daemon/lifecycle.ts).