
Commit e1c2d96

collectors: don't shut down on GetInfo timeout
Since the lnd GetInfo call sometimes takes way longer than average, we don't want to shut down on just a timeout.
1 parent c5cb503 commit e1c2d96

3 files changed: +67 −4 lines changed

collectors/chain_collector.go (+14 −2)
@@ -70,8 +70,20 @@ func (c *ChainCollector) Describe(ch chan<- *prometheus.Desc) {
 func (c *ChainCollector) Collect(ch chan<- prometheus.Metric) {
 	resp, err := c.lnd.GetInfo(context.Background())
 	if err != nil {
-		c.errChan <- fmt.Errorf("ChainCollector GetInfo failed with: "+
-			"%v", err)
+		errWithContext := fmt.Errorf("ChainCollector GetInfo "+
+			"failed with: %w", err)
+		Logger.Error(errWithContext)
+
+		// If this isn't just a timeout, we'll want to exit to give the
+		// runtime (Docker/k8s/systemd) a chance to restart us, in case
+		// something with the lnd connection and/or credentials has
+		// changed. We just do this check for the GetInfo call, since
+		// that's known to sometimes randomly take way longer than on
+		// average (database interactions?).
+		if !IsDeadlineExceeded(err) {
+			c.errChan <- errWithContext
+		}
+
 		return
 	}
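The comment above assumes that anything sent on errChan causes the exporter to shut down, so the supervising runtime can restart it. That consumer is not part of this diff; the following is only a minimal sketch of what such a loop could look like. The watchCollectorErrors name, the direct os.Exit call, and the simulated error in main are illustrative assumptions, not lndmon's actual shutdown path:

package main

import (
	"errors"
	"log"
	"os"
	"time"
)

// watchCollectorErrors treats anything forwarded on errChan as fatal and
// exits non-zero so that Docker/k8s/systemd can restart the exporter.
// Timeouts never reach this loop because of the IsDeadlineExceeded() check
// performed in the collectors.
func watchCollectorErrors(errChan <-chan error) {
	for err := range errChan {
		log.Printf("collector reported fatal error: %v", err)
		os.Exit(1)
	}
}

func main() {
	errChan := make(chan error, 1)
	go watchCollectorErrors(errChan)

	// Simulate a collector hitting a non-timeout failure (hypothetical).
	errChan <- errors.New("ChainCollector GetInfo failed with: wallet locked")

	// Give the watcher a moment to log and exit.
	time.Sleep(time.Second)
}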

collectors/channels_collector.go (+14 −2)
@@ -311,8 +311,20 @@ func (c *ChannelsCollector) Collect(ch chan<- prometheus.Metric) {
 	// have open.
 	getInfoResp, err := c.lnd.GetInfo(context.Background())
 	if err != nil {
-		c.errChan <- fmt.Errorf("ChannelsCollector GetInfo failed "+
-			"with: %v", err)
+		errWithContext := fmt.Errorf("ChannelsCollector GetInfo "+
+			"failed with: %w", err)
+		Logger.Error(errWithContext)
+
+		// If this isn't just a timeout, we'll want to exit to give the
+		// runtime (Docker/k8s/systemd) a chance to restart us, in case
+		// something with the lnd connection and/or credentials has
+		// changed. We just do this check for the GetInfo call, since
+		// that's known to sometimes randomly take way longer than on
+		// average (database interactions?).
+		if !IsDeadlineExceeded(err) {
+			c.errChan <- errWithContext
+		}
+
 		return
 	}

collectors/errors.go (+39 −0)
@@ -0,0 +1,39 @@
+package collectors
+
+import (
+	"context"
+	"strings"
+
+	"google.golang.org/grpc/codes"
+	"google.golang.org/grpc/status"
+)
+
+var (
+	// errRPCDeadlineExceeded is the error that is sent over the gRPC
+	// interface when it's coming from the server side. The
+	// status.FromContextError() function won't recognize it correctly
+	// since the error sent over the wire is a string and not a structured
+	// error anymore.
+	errRPCDeadlineExceeded = status.Error(
+		codes.DeadlineExceeded, context.DeadlineExceeded.Error(),
+	)
+)
+
+// IsDeadlineExceeded returns true if the passed error is a gRPC error with the
+// context.DeadlineExceeded error as the cause.
+func IsDeadlineExceeded(err error) bool {
+	if err == nil {
+		return false
+	}
+
+	st := status.FromContextError(err)
+	if st.Code() == codes.DeadlineExceeded {
+		return true
+	}
+
+	if strings.Contains(err.Error(), errRPCDeadlineExceeded.Error()) {
+		return true
+	}
+
+	return false
+}
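A minimal sketch of how the helper could be exercised in a unit test, assuming a hypothetical collectors/errors_test.go that is not part of this commit. It covers the two paths the comments above describe: a client-side context.DeadlineExceeded handled by status.FromContextError(), the flattened server-side status string caught by the strings.Contains() fallback, and an unrelated error that should still be treated as fatal:

package collectors

import (
	"context"
	"errors"
	"testing"

	"google.golang.org/grpc/codes"
	"google.golang.org/grpc/status"
)

func TestIsDeadlineExceeded(t *testing.T) {
	// A client-side timeout surfaces as context.DeadlineExceeded and is
	// mapped to codes.DeadlineExceeded by status.FromContextError().
	if !IsDeadlineExceeded(context.DeadlineExceeded) {
		t.Fatal("expected client-side deadline to be detected")
	}

	// A server-side timeout arrives as a plain gRPC status string and is
	// caught by the strings.Contains() fallback.
	serverErr := status.Error(
		codes.DeadlineExceeded, context.DeadlineExceeded.Error(),
	)
	if !IsDeadlineExceeded(serverErr) {
		t.Fatal("expected server-side deadline to be detected")
	}

	// Any other error should still be treated as fatal by the collectors.
	if IsDeadlineExceeded(errors.New("wallet locked")) {
		t.Fatal("unrelated error should not be treated as a timeout")
	}
}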
