Skip to content

Commit 4d31ad4

Browse files
feat(shrex): metrics (#2095)
Closes #2143. Adds metrics to ShrexGetter and to the ShrexEDS/ShrexND clients and servers (including the shrex middleware).
1 parent 1329f8b commit 4d31ad4

File tree

14 files changed

+311
-28
lines changed

14 files changed

+311
-28
lines changed

nodebuilder/settings.go

+16-5
Original file line numberDiff line numberDiff line change
@@ -72,19 +72,30 @@ func WithMetrics(metricOpts []otlpmetrichttp.Option, nodeType node.Type) fx.Opti
7272
fx.Invoke(modheader.WithMetrics),
7373
)
7474

75+
samplingMetrics := fx.Options(
76+
fx.Invoke(das.WithMetrics),
77+
fx.Invoke(share.WithPeerManagerMetrics),
78+
fx.Invoke(share.WithShrexClientMetrics),
79+
fx.Invoke(share.WithShrexGetterMetrics),
80+
)
81+
7582
var opts fx.Option
7683
switch nodeType {
77-
case node.Full, node.Light:
84+
case node.Full:
85+
opts = fx.Options(
86+
baseComponents,
87+
fx.Invoke(share.WithShrexServerMetrics),
88+
samplingMetrics,
89+
)
90+
case node.Light:
7891
opts = fx.Options(
7992
baseComponents,
80-
fx.Invoke(das.WithMetrics),
81-
fx.Invoke(share.WithPeerManagerMetrics),
82-
// add more monitoring here
93+
samplingMetrics,
8394
)
8495
case node.Bridge:
8596
opts = fx.Options(
8697
baseComponents,
87-
// add more monitoring here
98+
fx.Invoke(share.WithShrexServerMetrics),
8899
)
89100
default:
90101
panic("invalid node type")

nodebuilder/share/config.go

-1
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@ import (
99

1010
type Config struct {
1111
// UseShareExchange is a flag toggling the usage of shrex protocols for blocksync.
12-
// NOTE: This config variable only has an effect on full and bridge nodes.
1312
UseShareExchange bool
1413
// ShrExEDSParams sets shrexeds client and server configuration parameters
1514
ShrExEDSParams *shrexeds.Parameters

nodebuilder/share/opts.go

+25
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,36 @@
11
package share
22

33
import (
4+
"github.com/celestiaorg/celestia-node/share/getters"
45
"github.com/celestiaorg/celestia-node/share/p2p/peers"
6+
"github.com/celestiaorg/celestia-node/share/p2p/shrexeds"
7+
"github.com/celestiaorg/celestia-node/share/p2p/shrexnd"
58
)
69

710
// WithPeerManagerMetrics is a utility function that is expected to be
811
// "invoked" by the fx lifecycle.
912
func WithPeerManagerMetrics(m *peers.Manager) error {
1013
return m.WithMetrics()
1114
}
15+
16+
// WithShrexClientMetrics calls WithMetrics on the shrex/eds client and then on
// the shrex/nd client. It is a utility function that is expected to be
// "invoked" by the fx lifecycle.
// The first error is returned as-is; if the eds client fails, WithMetrics is
// not called on the nd client.
func WithShrexClientMetrics(edsClient *shrexeds.Client, ndClient *shrexnd.Client) error {
17+
err := edsClient.WithMetrics()
18+
if err != nil {
19+
return err
20+
}
21+
22+
return ndClient.WithMetrics()
23+
}
24+
25+
// WithShrexServerMetrics calls WithMetrics on the shrex/eds server and then on
// the shrex/nd server. It is a utility function that is expected to be
// "invoked" by the fx lifecycle.
// The first error is returned as-is; if the eds server fails, WithMetrics is
// not called on the nd server.
func WithShrexServerMetrics(edsServer *shrexeds.Server, ndServer *shrexnd.Server) error {
26+
err := edsServer.WithMetrics()
27+
if err != nil {
28+
return err
29+
}
30+
31+
return ndServer.WithMetrics()
32+
}
33+
34+
// WithShrexGetterMetrics calls WithMetrics on the given ShrexGetter and
// returns its error. It is a utility function that is expected to be
// "invoked" by the fx lifecycle.
func WithShrexGetterMetrics(sg *getters.ShrexGetter) error {
35+
return sg.WithMetrics()
36+
}

share/getters/shrex.go

+65
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,12 @@ import (
66
"fmt"
77
"time"
88

9+
"go.opentelemetry.io/otel/attribute"
10+
"go.opentelemetry.io/otel/metric/global"
11+
"go.opentelemetry.io/otel/metric/instrument"
12+
"go.opentelemetry.io/otel/metric/instrument/syncint64"
13+
"go.opentelemetry.io/otel/metric/unit"
14+
915
"github.com/celestiaorg/nmt/namespace"
1016
"github.com/celestiaorg/rsmt2d"
1117

@@ -23,8 +29,61 @@ const (
2329
// serve getEDS request for block size 256
2430
defaultMinRequestTimeout = time.Minute // should be >= shrexeds server write timeout
2531
defaultMinAttemptsCount = 3
32+
metricObservationTimeout = 100 * time.Millisecond
2633
)
2734

35+
var meter = global.MeterProvider().Meter("shrex/getter")
36+
37+
// metrics holds the histograms that ShrexGetter records into: one for
// shrex/eds requests and one for shrex/nd requests. Each observation is the
// number of attempts a single request took.
type metrics struct {
38+
edsAttempts syncint64.Histogram
39+
ndAttempts syncint64.Histogram
40+
}
41+
42+
// recordEDSAttempt records attemptCount into the edsAttempts histogram, tagged
// with a boolean "success" attribute.
// It is a no-op on a nil receiver, so callers do not need to check whether
// metrics were enabled. The recording uses a fresh background context bounded
// by metricObservationTimeout rather than the request context.
func (m *metrics) recordEDSAttempt(attemptCount int, success bool) {
43+
if m == nil {
44+
return
45+
}
46+
ctx, cancel := context.WithTimeout(context.Background(), metricObservationTimeout)
47+
defer cancel()
48+
m.edsAttempts.Record(ctx, int64(attemptCount), attribute.Bool("success", success))
49+
}
50+
51+
// recordNDAttempt records attemptCount into the ndAttempts histogram, tagged
// with a boolean "success" attribute.
// It is a no-op on a nil receiver, so callers do not need to check whether
// metrics were enabled. The recording uses a fresh background context bounded
// by metricObservationTimeout rather than the request context.
func (m *metrics) recordNDAttempt(attemptCount int, success bool) {
52+
if m == nil {
53+
return
54+
}
55+
56+
ctx, cancel := context.WithTimeout(context.Background(), metricObservationTimeout)
57+
defer cancel()
58+
m.ndAttempts.Record(ctx, int64(attemptCount), attribute.Bool("success", success))
59+
}
60+
61+
// WithMetrics creates the attempts-per-request histograms for shrex/eds and
// shrex/nd on the package-level meter and installs them on the getter.
// If creating either histogram fails, the error is returned and sg.metrics is
// left unchanged.
func (sg *ShrexGetter) WithMetrics() error {
62+
edsAttemptHistogram, err := meter.SyncInt64().Histogram(
63+
"getters_shrex_eds_attempts_per_request",
64+
instrument.WithUnit(unit.Dimensionless),
65+
instrument.WithDescription("Number of attempts per shrex/eds request"),
66+
)
67+
if err != nil {
68+
return err
69+
}
70+
71+
ndAttemptHistogram, err := meter.SyncInt64().Histogram(
72+
"getters_shrex_nd_attempts_per_request",
73+
instrument.WithUnit(unit.Dimensionless),
74+
instrument.WithDescription("Number of attempts per shrex/nd request"),
75+
)
76+
if err != nil {
77+
return err
78+
}
79+
80+
sg.metrics = &metrics{
81+
edsAttempts: edsAttemptHistogram,
82+
ndAttempts: ndAttemptHistogram,
83+
}
84+
return nil
85+
}
86+
2887
// ShrexGetter is a share.Getter that uses the shrex/eds and shrex/nd protocol to retrieve shares.
2988
type ShrexGetter struct {
3089
edsClient *shrexeds.Client
@@ -37,6 +96,8 @@ type ShrexGetter struct {
3796
// minAttemptsCount will be used to split request timeout into multiple attempts. It will allow to
3897
// attempt multiple peers in scope of one request before context timeout is reached
3998
minAttemptsCount int
99+
100+
metrics *metrics
40101
}
41102

42103
func NewShrexGetter(edsClient *shrexeds.Client, ndClient *shrexnd.Client, peerManager *peers.Manager) *ShrexGetter {
@@ -79,6 +140,7 @@ func (sg *ShrexGetter) GetEDS(ctx context.Context, root *share.Root) (*rsmt2d.Ex
79140
"hash", root.String(),
80141
"err", getErr,
81142
"finished (s)", time.Since(start))
143+
sg.metrics.recordEDSAttempt(attempt, false)
82144
return nil, fmt.Errorf("getter/shrex: %w", err)
83145
}
84146

@@ -89,6 +151,7 @@ func (sg *ShrexGetter) GetEDS(ctx context.Context, root *share.Root) (*rsmt2d.Ex
89151
switch {
90152
case getErr == nil:
91153
setStatus(peers.ResultSynced)
154+
sg.metrics.recordEDSAttempt(attempt, true)
92155
return eds, nil
93156
case errors.Is(getErr, context.DeadlineExceeded),
94157
errors.Is(getErr, context.Canceled):
@@ -135,6 +198,7 @@ func (sg *ShrexGetter) GetSharesByNamespace(
135198
"hash", root.String(),
136199
"err", getErr,
137200
"finished (s)", time.Since(start))
201+
sg.metrics.recordNDAttempt(attempt, false)
138202
return nil, fmt.Errorf("getter/shrex: %w", err)
139203
}
140204

@@ -145,6 +209,7 @@ func (sg *ShrexGetter) GetSharesByNamespace(
145209
switch {
146210
case getErr == nil:
147211
setStatus(peers.ResultNoop)
212+
sg.metrics.recordNDAttempt(attempt, true)
148213
return nd, nil
149214
case errors.Is(getErr, context.DeadlineExceeded),
150215
errors.Is(getErr, context.Canceled):

share/p2p/metrics.go

+72
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
package p2p
2+
3+
import (
4+
"context"
5+
"fmt"
6+
"time"
7+
8+
"go.opentelemetry.io/otel/attribute"
9+
"go.opentelemetry.io/otel/metric/global"
10+
"go.opentelemetry.io/otel/metric/instrument"
11+
"go.opentelemetry.io/otel/metric/instrument/syncint64"
12+
"go.opentelemetry.io/otel/metric/unit"
13+
)
14+
15+
var meter = global.MeterProvider().Meter("shrex/eds")
16+
17+
var observationTimeout = 100 * time.Millisecond
18+
19+
// status is the outcome label of a request; ObserveRequests attaches it to the
// counter as the "status" attribute.
type status string
20+
21+
// Outcome labels accepted by ObserveRequests. The string values are what is
// exported as the "status" attribute.
const (
22+
StatusInternalErr status = "internal_err"
23+
StatusNotFound status = "not_found"
24+
StatusTimeout status = "timeout"
25+
StatusSuccess status = "success"
26+
StatusRateLimited status = "rate_limited"
27+
)
28+
29+
// Metrics wraps the single counter created by InitClientMetrics or
// InitServerMetrics. A nil *Metrics is valid: ObserveRequests does nothing
// on it.
type Metrics struct {
30+
// totalRequestCounter counts requests on the client side and responses on
// the server side, depending on which Init function created it.
totalRequestCounter syncint64.Counter
31+
}
32+
33+
// ObserveRequests adds count to the total request counter, with the given
// status attached as the "status" attribute.
// It is a no-op on a nil receiver, so callers do not need to check whether
// metrics were enabled. The addition uses a fresh background context bounded
// by observationTimeout.
34+
func (m *Metrics) ObserveRequests(count int64, status status) {
35+
if m == nil {
36+
return
37+
}
38+
39+
ctx, cancel := context.WithTimeout(context.Background(), observationTimeout)
40+
defer cancel()
41+
m.totalRequestCounter.Add(ctx, count, attribute.String("status", string(status)))
42+
}
43+
44+
// InitClientMetrics creates the client-side request counter for the given
// protocol and returns it wrapped in Metrics. The protocol string is
// interpolated into the instrument name ("shrex_<protocol>_client_total_requests")
// and its description.
// It returns a nil *Metrics and the error if the counter cannot be created.
func InitClientMetrics(protocol string) (*Metrics, error) {
45+
totalRequestCounter, err := meter.SyncInt64().Counter(
46+
fmt.Sprintf("shrex_%s_client_total_requests", protocol),
47+
instrument.WithUnit(unit.Dimensionless),
48+
instrument.WithDescription(fmt.Sprintf("Total count of sent shrex/%s requests", protocol)),
49+
)
50+
if err != nil {
51+
return nil, err
52+
}
53+
54+
return &Metrics{
55+
totalRequestCounter: totalRequestCounter,
56+
}, nil
57+
}
58+
59+
// InitServerMetrics creates the server-side response counter for the given
// protocol and returns it wrapped in Metrics. The protocol string is
// interpolated into the instrument name ("shrex_<protocol>_server_total_responses")
// and its description.
// It returns a nil *Metrics and the error if the counter cannot be created.
func InitServerMetrics(protocol string) (*Metrics, error) {
60+
totalRequestCounter, err := meter.SyncInt64().Counter(
61+
fmt.Sprintf("shrex_%s_server_total_responses", protocol),
62+
instrument.WithUnit(unit.Dimensionless),
63+
instrument.WithDescription(fmt.Sprintf("Total count of sent shrex/%s responses", protocol)),
64+
)
65+
if err != nil {
66+
return nil, err
67+
}
68+
69+
return &Metrics{
70+
totalRequestCounter: totalRequestCounter,
71+
}, nil
72+
}

share/p2p/middleware.go

+26-7
Original file line numberDiff line numberDiff line change
@@ -9,21 +9,40 @@ import (
99

1010
var log = logging.Logger("shrex/middleware")
1111

12-
func RateLimitMiddleware(inner network.StreamHandler, concurrencyLimit int) network.StreamHandler {
13-
var parallelRequests int64
14-
limit := int64(concurrencyLimit)
12+
// Middleware holds the state for rate limiting stream handlers: the
// concurrency limit, the number of requests in flight, and a count of rejected
// requests that DrainCounter reads and resets.
type Middleware struct {
13+
// concurrencyLimit is the maximum number of requests that can be processed at once.
14+
concurrencyLimit int64
15+
// parallelRequests is the number of requests currently being processed.
16+
parallelRequests atomic.Int64
17+
// numRateLimited is the number of requests that were rate limited since the
// last call to DrainCounter.
18+
numRateLimited atomic.Int64
19+
}
20+
21+
// NewMiddleware returns a Middleware whose concurrency limit is set to
// concurrencyLimit; both counters start at zero.
func NewMiddleware(concurrencyLimit int) *Middleware {
22+
return &Middleware{
23+
concurrencyLimit: int64(concurrencyLimit),
24+
}
25+
}
26+
27+
// DrainCounter returns the current value of the rate limit counter and resets it to 0.
// Read and reset happen in a single atomic swap, so each call returns the
// number of requests rate limited since the previous call.
28+
func (m *Middleware) DrainCounter() int64 {
29+
return m.numRateLimited.Swap(0)
30+
}
31+
32+
func (m *Middleware) RateLimitHandler(handler network.StreamHandler) network.StreamHandler {
1533
return func(stream network.Stream) {
16-
current := atomic.AddInt64(&parallelRequests, 1)
17-
defer atomic.AddInt64(&parallelRequests, -1)
34+
current := m.parallelRequests.Add(1)
35+
defer m.parallelRequests.Add(-1)
1836

19-
if current > limit {
37+
if current > m.concurrencyLimit {
38+
m.numRateLimited.Add(1)
2039
log.Debug("concurrency limit reached")
2140
err := stream.Close()
2241
if err != nil {
2342
log.Debugw("server: closing stream", "err", err)
2443
}
2544
return
2645
}
27-
inner(stream)
46+
handler(stream)
2847
}
2948
}

share/p2p/shrexeds/client.go

+9-1
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,9 @@ import (
2626
type Client struct {
2727
params *Parameters
2828
protocolID protocol.ID
29+
host host.Host
2930

30-
host host.Host
31+
metrics *p2p.Metrics
3132
}
3233

3334
// NewClient creates a new ShrEx/EDS client.
@@ -53,14 +54,17 @@ func (c *Client) RequestEDS(
5354
if err == nil {
5455
return eds, nil
5556
}
57+
log.Debugw("client: eds request to peer failed", "peer", peer, "hash", dataHash.String(), "error", err)
5658
if errors.Is(err, context.DeadlineExceeded) || errors.Is(err, context.Canceled) {
59+
c.metrics.ObserveRequests(1, p2p.StatusTimeout)
5760
return nil, ctx.Err()
5861
}
5962
// some net.Errors also mean the context deadline was exceeded, but yamux/mocknet do not
6063
// unwrap to a ctx err
6164
var ne net.Error
6265
if errors.As(err, &ne) && ne.Timeout() {
6366
if deadline, _ := ctx.Deadline(); deadline.Before(time.Now()) {
67+
c.metrics.ObserveRequests(1, p2p.StatusTimeout)
6468
return nil, context.DeadlineExceeded
6569
}
6670
}
@@ -106,6 +110,7 @@ func (c *Client) doRequest(
106110
if err != nil {
107111
// server is overloaded and closed the stream
108112
if errors.Is(err, io.EOF) {
113+
c.metrics.ObserveRequests(1, p2p.StatusRateLimited)
109114
return nil, p2p.ErrNotFound
110115
}
111116
stream.Reset() //nolint:errcheck
@@ -119,15 +124,18 @@ func (c *Client) doRequest(
119124
if err != nil {
120125
return nil, fmt.Errorf("failed to read eds from ods bytes: %w", err)
121126
}
127+
c.metrics.ObserveRequests(1, p2p.StatusSuccess)
122128
return eds, nil
123129
case pb.Status_NOT_FOUND:
130+
c.metrics.ObserveRequests(1, p2p.StatusNotFound)
124131
return nil, p2p.ErrNotFound
125132
case pb.Status_INVALID:
126133
log.Debug("client: invalid request")
127134
fallthrough
128135
case pb.Status_INTERNAL:
129136
fallthrough
130137
default:
138+
c.metrics.ObserveRequests(1, p2p.StatusInternalErr)
131139
return nil, p2p.ErrInvalidResponse
132140
}
133141
}

share/p2p/shrexeds/exchange_test.go

+2-1
Original file line numberDiff line numberDiff line change
@@ -107,8 +107,9 @@ func TestExchange_RequestEDS(t *testing.T) {
107107
t.Fatal("timeout")
108108
}
109109
}
110+
middleware := p2p.NewMiddleware(rateLimit)
110111
server.host.SetStreamHandler(server.protocolID,
111-
p2p.RateLimitMiddleware(mockHandler, rateLimit))
112+
middleware.RateLimitHandler(mockHandler))
112113

113114
// take server concurrency slots with blocked requests
114115
for i := 0; i < rateLimit; i++ {

0 commit comments

Comments
 (0)