
Commit 745d3d1

Merge pull request #110 from guggero/ignore-timeout
collectors: don't shut down on timeout on `GetInfo` RPC call
2 parents 892dae7 + e1c2d96 commit 745d3d1

4 files changed: +85 -14 lines changed
collectors/chain_collector.go

+14 -2
@@ -70,8 +70,20 @@ func (c *ChainCollector) Describe(ch chan<- *prometheus.Desc) {
 func (c *ChainCollector) Collect(ch chan<- prometheus.Metric) {
     resp, err := c.lnd.GetInfo(context.Background())
     if err != nil {
-        c.errChan <- fmt.Errorf("ChainCollector GetInfo failed with: "+
-            "%v", err)
+        errWithContext := fmt.Errorf("ChainCollector GetInfo "+
+            "failed with: %w", err)
+        Logger.Error(errWithContext)
+
+        // If this isn't just a timeout, we'll want to exit to give the
+        // runtime (Docker/k8s/systemd) a chance to restart us, in case
+        // something with the lnd connection and/or credentials has
+        // changed. We just do this check for the GetInfo call, since
+        // that's known to sometimes randomly take way longer than on
+        // average (database interactions?).
+        if !IsDeadlineExceeded(err) {
+            c.errChan <- errWithContext
+        }
+
         return
     }
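Note that the wrapping verb changes from %v to %w here, so whatever drains errChan can still inspect the underlying gRPC error. A minimal standalone sketch of the difference (illustrative only, not part of this change, standard library only):

package main

import (
    "context"
    "errors"
    "fmt"
)

func main() {
    cause := context.DeadlineExceeded

    // %w keeps the cause in the error chain; %v only keeps its text.
    wrapped := fmt.Errorf("GetInfo failed with: %w", cause)
    flattened := fmt.Errorf("GetInfo failed with: %v", cause)

    fmt.Println(errors.Is(wrapped, context.DeadlineExceeded))   // true
    fmt.Println(errors.Is(flattened, context.DeadlineExceeded)) // false
}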

collectors/channels_collector.go

+14 -2
@@ -311,8 +311,20 @@ func (c *ChannelsCollector) Collect(ch chan<- prometheus.Metric) {
     // have open.
     getInfoResp, err := c.lnd.GetInfo(context.Background())
     if err != nil {
-        c.errChan <- fmt.Errorf("ChannelsCollector GetInfo failed "+
-            "with: %v", err)
+        errWithContext := fmt.Errorf("ChannelsCollector GetInfo "+
+            "failed with: %w", err)
+        Logger.Error(errWithContext)
+
+        // If this isn't just a timeout, we'll want to exit to give the
+        // runtime (Docker/k8s/systemd) a chance to restart us, in case
+        // something with the lnd connection and/or credentials has
+        // changed. We just do this check for the GetInfo call, since
+        // that's known to sometimes randomly take way longer than on
+        // average (database interactions?).
+        if !IsDeadlineExceeded(err) {
+            c.errChan <- errWithContext
+        }
+
         return
     }

collectors/errors.go

+39
@@ -0,0 +1,39 @@
+package collectors
+
+import (
+    "context"
+    "strings"
+
+    "google.golang.org/grpc/codes"
+    "google.golang.org/grpc/status"
+)
+
+var (
+    // errRPCDeadlineExceeded is the error that is sent over the gRPC
+    // interface when it's coming from the server side. The
+    // status.FromContextError() function won't recognize it correctly
+    // since the error sent over the wire is a string and not a structured
+    // error anymore.
+    errRPCDeadlineExceeded = status.Error(
+        codes.DeadlineExceeded, context.DeadlineExceeded.Error(),
+    )
+)
+
+// IsDeadlineExceeded returns true if the passed error is a gRPC error with the
+// context.DeadlineExceeded error as the cause.
+func IsDeadlineExceeded(err error) bool {
+    if err == nil {
+        return false
+    }
+
+    st := status.FromContextError(err)
+    if st.Code() == codes.DeadlineExceeded {
+        return true
+    }
+
+    if strings.Contains(err.Error(), errRPCDeadlineExceeded.Error()) {
+        return true
+    }
+
+    return false
+}
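A hypothetical test-style sketch of how the helper behaves (not included in this commit); it assumes it lives in the collectors package so it can call IsDeadlineExceeded directly:

package collectors

import (
    "context"
    "errors"
    "testing"

    "google.golang.org/grpc/codes"
    "google.golang.org/grpc/status"
)

func TestIsDeadlineExceeded(t *testing.T) {
    // Client-side timeout: recognized by the status.FromContextError check.
    if !IsDeadlineExceeded(context.DeadlineExceeded) {
        t.Error("client-side deadline should be detected")
    }

    // Server-side timeout that arrives as a flattened gRPC status error:
    // recognized by the string-comparison fallback.
    serverErr := status.Error(
        codes.DeadlineExceeded, context.DeadlineExceeded.Error(),
    )
    if !IsDeadlineExceeded(serverErr) {
        t.Error("server-side deadline should be detected")
    }

    // Any other error should still count as a real failure.
    if IsDeadlineExceeded(errors.New("connection refused")) {
        t.Error("non-deadline errors should not match")
    }
}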

collectors/prometheus.go

+18 -10
@@ -72,7 +72,8 @@ type MonitoringConfig struct {
     // DisableHtlc disables collection of HTLCs metrics
     DisableHtlc bool
 
-    // ProgramStartTime stores a best-effort estimate of when lnd/lndmon was started.
+    // ProgramStartTime stores a best-effort estimate of when lnd/lndmon was
+    // started.
     ProgramStartTime time.Time
 }

@@ -88,13 +89,14 @@ func DefaultConfig() *PrometheusConfig {
 // NewPrometheusExporter makes a new instance of the PrometheusExporter given
 // the address to listen for Prometheus on and an lnd gRPC client.
 func NewPrometheusExporter(cfg *PrometheusConfig, lnd *lndclient.LndServices,
-    monitoringCfg *MonitoringConfig, quitChan chan struct{}) *PrometheusExporter {
+    monitoringCfg *MonitoringConfig,
+    quitChan chan struct{}) *PrometheusExporter {
 
     // We have six collectors and a htlc monitor running, so we buffer our
-    // error channel by 7 so that we do not need to consume all errors from
+    // error channel by 8 so that we do not need to consume all errors from
     // this channel (on the first one, we'll start shutting down, but a few
     // could arrive quickly in the case where lnd is shutting down).
-    errChan := make(chan error, 7)
+    errChan := make(chan error, 8)
 
     htlcMonitor := newHtlcMonitor(lnd.Router, errChan)

@@ -116,7 +118,9 @@ func NewPrometheusExporter(cfg *PrometheusConfig, lnd *lndclient.LndServices,
     }
 
     if !monitoringCfg.DisableGraph {
-        collectors = append(collectors, NewGraphCollector(lnd.Client, errChan))
+        collectors = append(
+            collectors, NewGraphCollector(lnd.Client, errChan),
+        )
     }
 
     return &PrometheusExporter{

@@ -165,15 +169,19 @@ func (p *PrometheusExporter) Start() error {
     // scape our metrics.
     go func() {
         errorLogger := log.New(
-            os.Stdout, "promhttp", log.Ldate|log.Ltime|log.Lshortfile,
+            os.Stdout, "promhttp",
+            log.Ldate|log.Ltime|log.Lshortfile,
         )
 
         promHandler := promhttp.InstrumentMetricHandler(
             prometheus.DefaultRegisterer,
-            promhttp.HandlerFor(prometheus.DefaultGatherer, promhttp.HandlerOpts{
-                ErrorLog:      errorLogger,
-                ErrorHandling: promhttp.ContinueOnError,
-            }),
+            promhttp.HandlerFor(
+                prometheus.DefaultGatherer,
+                promhttp.HandlerOpts{
+                    ErrorLog:      errorLogger,
+                    ErrorHandling: promhttp.ContinueOnError,
+                },
+            ),
         )
         http.Handle("/metrics", promHandler)
         Logger.Info(http.ListenAndServe(p.cfg.ListenAddr, nil))
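The comment above explains why the error channel is buffered to roughly match the number of producers: the consumer acts on the first error and starts shutting down, and the buffer keeps the remaining senders from blocking in the meantime. A standalone sketch of that pattern (illustrative only, not lndmon code):

package main

import (
    "fmt"
    "sync"
)

func main() {
    // One buffer slot per producer: every producer can report its error
    // without blocking, even though only the first error is consumed.
    const producers = 8
    errChan := make(chan error, producers)

    var wg sync.WaitGroup
    for i := 0; i < producers; i++ {
        wg.Add(1)
        go func(id int) {
            defer wg.Done()
            errChan <- fmt.Errorf("collector %d failed", id)
        }(i)
    }

    // Consumer: act on the first error, then begin shutting down. The
    // remaining sends still complete thanks to the buffer.
    fmt.Println("shutting down on:", <-errChan)
    wg.Wait()
}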
