Skip to content

Commit 26c5ca7

Browse files
committed
fix: don't record failures on context cancellation
Gate the case where the retriever (parent) cancels a protocol retriever because another has completed successfully. If a retrieval fails in this case, it's not a failure that we should either report in the event system, or record against the SP's performance stats. Fixes: #364 I believe this fixes #364, but it's hard to reproduce reliably so I can't be sure.
1 parent 175ec2a commit 26c5ca7

File tree

2 files changed

+23
-10
lines changed

2 files changed

+23
-10
lines changed

pkg/retriever/bitswapretriever.go

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ package retriever
33
import (
44
"bytes"
55
"context"
6+
"errors"
67
"fmt"
78
"io"
89
"math"
@@ -242,8 +243,14 @@ func (br *bitswapRetrieval) RetrieveFromAsyncCandidates(ayncCandidates types.Inb
242243
br.routing.RemoveProviders(br.request.RetrievalID)
243244
br.bstore.RemoveLinkSystem(br.request.RetrievalID)
244245
if err != nil {
245-
// record failure
246-
br.events(events.FailedRetrieval(br.clock.Now(), br.request.RetrievalID, bitswapCandidate, multicodec.TransportBitswap, err.Error()))
246+
// If the local context was canceled and the parent wasn't, it could be from a block timeout
247+
// (see lastBytesReceivedTimer), in which case it needs to be recorded as a failure.
248+
// However, if the parent context has an error we want to exclude cancellation errors
249+
// since the failure could be because another protocol finished, not because of a bitswap
250+
// problem.
251+
if br.ctx.Err() == nil || !errors.Is(err, context.Canceled) {
252+
br.events(events.FailedRetrieval(br.clock.Now(), br.request.RetrievalID, bitswapCandidate, multicodec.TransportBitswap, err.Error()))
253+
}
247254
return nil, err
248255
}
249256
duration := br.clock.Since(startTime)

pkg/retriever/parallelpeerretriever.go

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -327,7 +327,9 @@ func (retrieval *retrieval) runRetrievalCandidate(
327327
// Setup in parallel
328328
connectTime, err := retrieval.Protocol.Connect(connectCtx, retrieval, startTime, candidate)
329329
if err != nil {
330-
if ctx.Err() == nil { // not cancelled, maybe timed out though
330+
// Exclude the case where the context was cancelled by the parent, whichikely means that
331+
// another protocol has succeeded.
332+
if ctx.Err() == nil || !errors.Is(err, context.Canceled) {
331333
logger.Warnf("Failed to connect to SP %s on protocol %s: %v", candidate.MinerPeer.ID, retrieval.Protocol.Code().String(), err)
332334
retrievalErr = fmt.Errorf("%w: %v", ErrConnectFailed, err)
333335
if err := retrieval.Session.RecordFailure(retrieval.request.RetrievalID, candidate.MinerPeer.ID); err != nil {
@@ -347,13 +349,17 @@ func (retrieval *retrieval) runRetrievalCandidate(
347349
stats, retrievalErr = retrieval.Protocol.Retrieve(ctx, retrieval, shared, timeout, candidate)
348350

349351
if retrievalErr != nil {
350-
msg := retrievalErr.Error()
351-
if errors.Is(retrievalErr, ErrRetrievalTimedOut) {
352-
msg = fmt.Sprintf("timeout after %s", timeout)
353-
}
354-
shared.sendEvent(ctx, events.FailedRetrieval(retrieval.parallelPeerRetriever.Clock.Now(), retrieval.request.RetrievalID, candidate, retrieval.Protocol.Code(), msg))
355-
if err := retrieval.Session.RecordFailure(retrieval.request.RetrievalID, candidate.MinerPeer.ID); err != nil {
356-
logger.Errorf("Error recording retrieval failure for protocol %s: %v", retrieval.Protocol.Code().String(), err)
352+
// Exclude the case where the context was cancelled by the parent, which likely
353+
// means that another protocol has succeeded.
354+
if ctx.Err() == nil || !errors.Is(err, context.Canceled) {
355+
msg := retrievalErr.Error()
356+
if errors.Is(retrievalErr, ErrRetrievalTimedOut) {
357+
msg = fmt.Sprintf("timeout after %s", timeout)
358+
}
359+
shared.sendEvent(ctx, events.FailedRetrieval(retrieval.parallelPeerRetriever.Clock.Now(), retrieval.request.RetrievalID, candidate, retrieval.Protocol.Code(), msg))
360+
if err := retrieval.Session.RecordFailure(retrieval.request.RetrievalID, candidate.MinerPeer.ID); err != nil {
361+
logger.Errorf("Error recording retrieval failure for protocol %s: %v", retrieval.Protocol.Code().String(), err)
362+
}
357363
}
358364
} else {
359365
shared.sendEvent(ctx, events.Success(

0 commit comments

Comments
 (0)