
Commit 745d3d1

Merge pull request #110 from guggero/ignore-timeout
collectors: don't shut down on timeout on `GetInfo` RPC call
2 parents 892dae7 + e1c2d96 commit 745d3d1

4 files changed: +85 -14 lines changed
collectors/chain_collector.go

+14 -2
@@ -70,8 +70,20 @@ func (c *ChainCollector) Describe(ch chan<- *prometheus.Desc) {
 func (c *ChainCollector) Collect(ch chan<- prometheus.Metric) {
     resp, err := c.lnd.GetInfo(context.Background())
     if err != nil {
-        c.errChan <- fmt.Errorf("ChainCollector GetInfo failed with: "+
-            "%v", err)
+        errWithContext := fmt.Errorf("ChainCollector GetInfo "+
+            "failed with: %w", err)
+        Logger.Error(errWithContext)
+
+        // If this isn't just a timeout, we'll want to exit to give the
+        // runtime (Docker/k8s/systemd) a chance to restart us, in case
+        // something with the lnd connection and/or credentials has
+        // changed. We just do this check for the GetInfo call, since
+        // that's known to sometimes randomly take way longer than on
+        // average (database interactions?).
+        if !IsDeadlineExceeded(err) {
+            c.errChan <- errWithContext
+        }
+
         return
     }
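Note that the wrapping verb changes from %v to %w here, so whatever drains errChan can still inspect the underlying gRPC error. A minimal standalone sketch of the difference (illustrative only, not part of this change, standard library only):

package main

import (
    "context"
    "errors"
    "fmt"
)

func main() {
    cause := context.DeadlineExceeded

    // %w keeps the cause in the error chain; %v only keeps its text.
    wrapped := fmt.Errorf("GetInfo failed with: %w", cause)
    flattened := fmt.Errorf("GetInfo failed with: %v", cause)

    fmt.Println(errors.Is(wrapped, context.DeadlineExceeded))   // true
    fmt.Println(errors.Is(flattened, context.DeadlineExceeded)) // false
}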

collectors/channels_collector.go

+14 -2
@@ -311,8 +311,20 @@ func (c *ChannelsCollector) Collect(ch chan<- prometheus.Metric) {
     // have open.
     getInfoResp, err := c.lnd.GetInfo(context.Background())
     if err != nil {
-        c.errChan <- fmt.Errorf("ChannelsCollector GetInfo failed "+
-            "with: %v", err)
+        errWithContext := fmt.Errorf("ChannelsCollector GetInfo "+
+            "failed with: %w", err)
+        Logger.Error(errWithContext)
+
+        // If this isn't just a timeout, we'll want to exit to give the
+        // runtime (Docker/k8s/systemd) a chance to restart us, in case
+        // something with the lnd connection and/or credentials has
+        // changed. We just do this check for the GetInfo call, since
+        // that's known to sometimes randomly take way longer than on
+        // average (database interactions?).
+        if !IsDeadlineExceeded(err) {
+            c.errChan <- errWithContext
+        }
+
         return
     }

collectors/errors.go

+39
@@ -0,0 +1,39 @@
+package collectors
+
+import (
+    "context"
+    "strings"
+
+    "google.golang.org/grpc/codes"
+    "google.golang.org/grpc/status"
+)
+
+var (
+    // errRPCDeadlineExceeded is the error that is sent over the gRPC
+    // interface when it's coming from the server side. The
+    // status.FromContextError() function won't recognize it correctly
+    // since the error sent over the wire is a string and not a structured
+    // error anymore.
+    errRPCDeadlineExceeded = status.Error(
+        codes.DeadlineExceeded, context.DeadlineExceeded.Error(),
+    )
+)
+
+// IsDeadlineExceeded returns true if the passed error is a gRPC error with the
+// context.DeadlineExceeded error as the cause.
+func IsDeadlineExceeded(err error) bool {
+    if err == nil {
+        return false
+    }
+
+    st := status.FromContextError(err)
+    if st.Code() == codes.DeadlineExceeded {
+        return true
+    }
+
+    if strings.Contains(err.Error(), errRPCDeadlineExceeded.Error()) {
+        return true
+    }
+
+    return false
+}
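A hypothetical test-style sketch of how the helper behaves (not included in this commit); it assumes it lives in the collectors package so it can call IsDeadlineExceeded directly:

package collectors

import (
    "context"
    "errors"
    "testing"

    "google.golang.org/grpc/codes"
    "google.golang.org/grpc/status"
)

func TestIsDeadlineExceeded(t *testing.T) {
    // Client-side timeout: recognized by the status.FromContextError check.
    if !IsDeadlineExceeded(context.DeadlineExceeded) {
        t.Error("client-side deadline should be detected")
    }

    // Server-side timeout that arrives as a flattened gRPC status error:
    // recognized by the string-comparison fallback.
    serverErr := status.Error(
        codes.DeadlineExceeded, context.DeadlineExceeded.Error(),
    )
    if !IsDeadlineExceeded(serverErr) {
        t.Error("server-side deadline should be detected")
    }

    // Any other error should still count as a real failure.
    if IsDeadlineExceeded(errors.New("connection refused")) {
        t.Error("non-deadline errors should not match")
    }
}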

collectors/prometheus.go

+18 -10
@@ -72,7 +72,8 @@ type MonitoringConfig struct {
     // DisableHtlc disables collection of HTLCs metrics
     DisableHtlc bool
 
-    // ProgramStartTime stores a best-effort estimate of when lnd/lndmon was started.
+    // ProgramStartTime stores a best-effort estimate of when lnd/lndmon was
+    // started.
     ProgramStartTime time.Time
 }

@@ -88,13 +89,14 @@ func DefaultConfig() *PrometheusConfig {
 // NewPrometheusExporter makes a new instance of the PrometheusExporter given
 // the address to listen for Prometheus on and an lnd gRPC client.
 func NewPrometheusExporter(cfg *PrometheusConfig, lnd *lndclient.LndServices,
-    monitoringCfg *MonitoringConfig, quitChan chan struct{}) *PrometheusExporter {
+    monitoringCfg *MonitoringConfig,
+    quitChan chan struct{}) *PrometheusExporter {
 
     // We have six collectors and a htlc monitor running, so we buffer our
-    // error channel by 7 so that we do not need to consume all errors from
+    // error channel by 8 so that we do not need to consume all errors from
     // this channel (on the first one, we'll start shutting down, but a few
     // could arrive quickly in the case where lnd is shutting down).
-    errChan := make(chan error, 7)
+    errChan := make(chan error, 8)
 
     htlcMonitor := newHtlcMonitor(lnd.Router, errChan)

@@ -116,7 +118,9 @@ func NewPrometheusExporter(cfg *PrometheusConfig, lnd *lndclient.LndServices,
     }
 
     if !monitoringCfg.DisableGraph {
-        collectors = append(collectors, NewGraphCollector(lnd.Client, errChan))
+        collectors = append(
+            collectors, NewGraphCollector(lnd.Client, errChan),
+        )
     }
 
     return &PrometheusExporter{

@@ -165,15 +169,19 @@ func (p *PrometheusExporter) Start() error {
     // scape our metrics.
     go func() {
         errorLogger := log.New(
-            os.Stdout, "promhttp", log.Ldate|log.Ltime|log.Lshortfile,
+            os.Stdout, "promhttp",
+            log.Ldate|log.Ltime|log.Lshortfile,
         )
 
         promHandler := promhttp.InstrumentMetricHandler(
             prometheus.DefaultRegisterer,
-            promhttp.HandlerFor(prometheus.DefaultGatherer, promhttp.HandlerOpts{
-                ErrorLog:      errorLogger,
-                ErrorHandling: promhttp.ContinueOnError,
-            }),
+            promhttp.HandlerFor(
+                prometheus.DefaultGatherer,
+                promhttp.HandlerOpts{
+                    ErrorLog:      errorLogger,
+                    ErrorHandling: promhttp.ContinueOnError,
+                },
+            ),
         )
         http.Handle("/metrics", promHandler)
         Logger.Info(http.ListenAndServe(p.cfg.ListenAddr, nil))
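The comment above explains why the error channel is buffered to roughly match the number of producers: the consumer acts on the first error and starts shutting down, and the buffer keeps the remaining senders from blocking in the meantime. A standalone sketch of that pattern (illustrative only, not lndmon code):

package main

import (
    "fmt"
    "sync"
)

func main() {
    // One buffer slot per producer: every producer can report its error
    // without blocking, even though only the first error is consumed.
    const producers = 8
    errChan := make(chan error, producers)

    var wg sync.WaitGroup
    for i := 0; i < producers; i++ {
        wg.Add(1)
        go func(id int) {
            defer wg.Done()
            errChan <- fmt.Errorf("collector %d failed", id)
        }(i)
    }

    // Consumer: act on the first error, then begin shutting down. The
    // remaining sends still complete thanks to the buffer.
    fmt.Println("shutting down on:", <-errChan)
    wg.Wait()
}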
