Skip to content

Commit 49a62de

Browse files
committed
telemetry: add metrics to PDP operation failures
1 parent ffb44c7 commit 49a62de

File tree

6 files changed

+83
-1
lines changed

6 files changed

+83
-1
lines changed

pkg/pdp/service/roots_add.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,7 @@ func (p *PDPService) AddRoots(ctx context.Context, id uint64, request []types.Ro
117117
log.Errorw("failed to add roots", "id", id, "request", request, "err", retErr)
118118
span.RecordError(retErr)
119119
span.SetStatus(codes.Error, "failed to add roots")
120+
PDPAddPieceFailureCounter.Inc(ctx)
120121
} else {
121122
span.SetAttributes(attribute.Stringer("tx", res))
122123
log.Infow("added roots", "id", id, "request", request, "response", res)

pkg/pdp/service/telemetry.go

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,32 @@ package service
22

33
import (
44
"go.opentelemetry.io/otel"
5+
6+
"github.com/storacha/piri/pkg/telemetry"
57
)
68

79
var (
810
tracer = otel.Tracer("github.com/storacha/piri/pkg/pdp/service")
911
)
12+
13+
var (
14+
// PDPAddPieceFailureCounter is incremented on the failure path of
// PDPService.AddRoots (the branch that logs "failed to add roots").
PDPAddPieceFailureCounter *telemetry.Counter
15+
)
16+
17+
// init creates the package's failure counter from the global telemetry
// instance. If counter creation fails, a warning is logged and the
// package-level counter is left nil.
// NOTE(review): call sites invoke Inc on the counter without a nil check —
// confirm that (*telemetry.Counter).Inc tolerates a nil receiver.
func init() {
18+
tel := telemetry.Global()
19+
// newCounter builds a counter with the given name and description; on
// error it logs a warning and returns nil instead of failing start-up.
newCounter := func(name, desc string) *telemetry.Counter {
20+
counter, err := tel.NewCounter(telemetry.CounterConfig{
21+
Name: name,
22+
Description: desc,
23+
})
24+
if err != nil {
25+
log.Warnw("failed to init telemetry counter", "name", name, "error", err)
26+
return nil
27+
}
28+
return counter
29+
}
30+
31+
PDPAddPieceFailureCounter = newCounter("pdp_add_piece_failure", "records failure to add a pdp piece")
32+
33+
}

pkg/pdp/tasks/next_pdp.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,11 @@ func adjustNextProveAt(nextProveAt int64, minRequiredEpoch int64, provingPeriod
146146

147147
func (n *NextProvingPeriodTask) Do(taskID scheduler.TaskID) (done bool, err error) {
148148
ctx := context.Background()
149+
defer func() {
150+
if err != nil {
151+
PDPNextFailureCounter.Inc(ctx)
152+
}
153+
}()
149154
// Select the proof set where challenge_request_task_id equals taskID and prove_at_epoch is not NULL
150155
var pdp models.PDPProofSet
151156
err = n.db.WithContext(ctx).

pkg/pdp/tasks/prove_pdp.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -162,6 +162,11 @@ func NewProveTask(
162162

163163
func (p *ProveTask) Do(taskID scheduler.TaskID) (done bool, err error) {
164164
ctx := context.Background()
165+
defer func() {
166+
if err != nil {
167+
PDPProveFailureCounter.Inc(ctx)
168+
}
169+
}()
165170

166171
// Retrieve proof set and challenge epoch for the task
167172
var proveTask models.PDPProveTask

pkg/pdp/tasks/sender_eth.go

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,12 +12,14 @@ import (
1212
ethtypes "github.com/ethereum/go-ethereum/core/types"
1313
"github.com/ethereum/go-ethereum/rpc"
1414
"github.com/storacha/filecoin-services/go/evmerrors"
15-
"github.com/storacha/piri/pkg/pdp/types"
15+
"go.opentelemetry.io/otel/attribute"
1616
"go.uber.org/multierr"
1717
"golang.org/x/xerrors"
1818
"gorm.io/gorm"
1919
"gorm.io/gorm/clause"
2020

21+
"github.com/storacha/piri/pkg/pdp/types"
22+
2123
"github.com/storacha/piri/pkg/pdp/promise"
2224
"github.com/storacha/piri/pkg/pdp/scheduler"
2325
"github.com/storacha/piri/pkg/pdp/service/models"
@@ -85,12 +87,20 @@ func (s *SenderETH) Send(ctx context.Context, fromAddress common.Address, tx *et
8587
if dataErr.ErrorData() != nil {
8688
if parsedErr, failure := evmerrors.ParseRevert(dataErr.ErrorData().(string)); failure == nil {
8789
log.Errorw("parsed contract revert during gas estimation", "error", parsedErr)
90+
// NB(forrest): ErrorSelector returns the contract error code as hex.
91+
// Selector values are finite and bounded, so adding one to the counter keeps
92+
// cardinality low while giving actionable diagnostics.
93+
MessageEstimateGasFailureCounter.Inc(ctx, attribute.String("selector",
94+
parsedErr.ErrorSelector()), attribute.String("method", reason))
8895
return common.Hash{}, types.NewError(types.KindInvalidInput, parsedErr.Error())
8996
} else {
9097
log.Warnw("failed to parse revert during gas estimation", "parse_error", failure, "original_error", err)
9198
}
9299
}
93100
}
101+
// NB(forrest): otherwise we consider the selector unknown
102+
MessageEstimateGasFailureCounter.Inc(ctx, attribute.String("selector", "unknown"),
103+
attribute.String("method", reason))
94104
return common.Hash{}, fmt.Errorf("failed to estimate gas: %w", err)
95105
}
96106
if gasLimit == 0 {
@@ -360,6 +370,7 @@ func (s *SendTaskETH) Do(taskID scheduler.TaskID) (done bool, err error) {
360370
var sendError string
361371
if err != nil {
362372
sendError = err.Error()
373+
MessageSendFailureCounter.Inc(ctx, attribute.String("method", dbTx.SendReason))
363374
}
364375

365376
err = s.db.Model(&models.MessageSendsEth{}).

pkg/pdp/tasks/telemetry.go

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
package tasks
2+
3+
import (
4+
"github.com/storacha/piri/pkg/telemetry"
5+
)
6+
7+
// Failure counters for PDP tasks. Each is assigned in init and may be nil if
// its creation failed.
var (
8+
// MessageEstimateGasFailureCounter is incremented in SenderETH.Send when gas
// estimation fails; it carries "selector" and "method" attributes.
MessageEstimateGasFailureCounter *telemetry.Counter
9+
// MessageSendFailureCounter is incremented in SendTaskETH.Do when sending
// returns an error; it carries a "method" attribute.
MessageSendFailureCounter *telemetry.Counter
10+
// PDPProveFailureCounter is incremented when ProveTask.Do returns a non-nil error.
PDPProveFailureCounter *telemetry.Counter
11+
// PDPNextFailureCounter is incremented when NextProvingPeriodTask.Do returns
// a non-nil error.
PDPNextFailureCounter *telemetry.Counter
12+
// NOTE(review): no code in the tasks hunks shown increments this counter, and
// pkg/pdp/service declares a variable with the same name and metric name
// ("pdp_add_piece_failure") — confirm whether this duplicate is needed.
PDPAddPieceFailureCounter *telemetry.Counter
13+
)
14+
15+
// init creates the tasks package's failure counters from the global telemetry
// instance. A counter whose creation fails is logged with a warning and left
// nil.
// NOTE(review): call sites invoke Inc on these counters without a nil check —
// confirm that (*telemetry.Counter).Inc tolerates a nil receiver.
func init() {
16+
tel := telemetry.Global()
17+
// newCounter builds a counter with the given name and description; on
// error it logs a warning and returns nil instead of failing start-up.
newCounter := func(name, desc string) *telemetry.Counter {
18+
counter, err := tel.NewCounter(telemetry.CounterConfig{
19+
Name: name,
20+
Description: desc,
21+
})
22+
if err != nil {
23+
log.Warnw("failed to init telemetry counter", "name", name, "error", err)
24+
return nil
25+
}
26+
return counter
27+
}
28+
29+
MessageEstimateGasFailureCounter = newCounter("pdp_message_estimate_gas_failure",
30+
"records failure to estimate gas for sending messages; similar to a send failure")
31+
MessageSendFailureCounter = newCounter("pdp_message_send_failure", "records failure to send a message")
32+
PDPNextFailureCounter = newCounter("pdp_next_failure", "records failure in next pdp task")
33+
PDPProveFailureCounter = newCounter("pdp_prove_failure", "records failure to submit a pdp proof")
34+
// NOTE(review): the same metric name is also registered by pkg/pdp/service's
// init — confirm that registering it twice is intended.
PDPAddPieceFailureCounter = newCounter("pdp_add_piece_failure", "records failure to add a pdp piece")
35+
36+
}

0 commit comments

Comments
 (0)