6
6
"math"
7
7
"math/rand"
8
8
"net"
9
+ "strings"
9
10
"sync/atomic"
10
11
"time"
11
12
@@ -242,6 +243,21 @@ func (m *Memberlist) probeNodeByAddr(addr string) {
242
243
m .probeNode (n )
243
244
}
244
245
246
// failedRemote reports whether err points to a failure on the other end of a
// connection. Only a *net.OpError produced by a "dial", "read", or "write"
// operation on a network whose name begins with "tcp" qualifies; every other
// error (including nil) yields false.
func failedRemote(err error) bool {
	opErr, isOpErr := err.(*net.OpError)
	if !isOpErr || !strings.HasPrefix(opErr.Net, "tcp") {
		return false
	}
	return opErr.Op == "dial" || opErr.Op == "read" || opErr.Op == "write"
}
260
+
245
261
// probeNode handles a single round of failure checking on a node.
246
262
func (m * Memberlist ) probeNode (node * nodeState ) {
247
263
defer metrics .MeasureSince ([]string {"memberlist" , "probeNode" }, time .Now ())
@@ -272,10 +288,20 @@ func (m *Memberlist) probeNode(node *nodeState) {
272
288
// soon as possible.
273
289
deadline := sent .Add (probeInterval )
274
290
addr := node .Address ()
291
+
292
+ // Arrange for our self-awareness to get updated.
293
+ var awarenessDelta int
294
+ defer func () {
295
+ m .awareness .ApplyDelta (awarenessDelta )
296
+ }()
275
297
if node .State == stateAlive {
276
298
if err := m .encodeAndSendMsg (addr , pingMsg , & ping ); err != nil {
277
299
m .logger .Printf ("[ERR] memberlist: Failed to send ping: %s" , err )
278
- return
300
+ if failedRemote (err ) {
301
+ goto HANDLE_REMOTE_FAILURE
302
+ } else {
303
+ return
304
+ }
279
305
}
280
306
} else {
281
307
var msgs [][]byte
@@ -296,7 +322,11 @@ func (m *Memberlist) probeNode(node *nodeState) {
296
322
compound := makeCompoundMessage (msgs )
297
323
if err := m .rawSendMsgPacket (addr , & node .Node , compound .Bytes ()); err != nil {
298
324
m .logger .Printf ("[ERR] memberlist: Failed to send compound ping and suspect message to %s: %s" , addr , err )
299
- return
325
+ if failedRemote (err ) {
326
+ goto HANDLE_REMOTE_FAILURE
327
+ } else {
328
+ return
329
+ }
300
330
}
301
331
}
302
332
@@ -305,10 +335,7 @@ func (m *Memberlist) probeNode(node *nodeState) {
305
335
// which will improve our health until we get to the failure scenarios
306
336
// at the end of this function, which will alter this delta variable
307
337
// accordingly.
308
- awarenessDelta := - 1
309
- defer func () {
310
- m .awareness .ApplyDelta (awarenessDelta )
311
- }()
338
+ awarenessDelta = - 1
312
339
313
340
// Wait for response or round-trip-time.
314
341
select {
@@ -333,9 +360,10 @@ func (m *Memberlist) probeNode(node *nodeState) {
333
360
// probe interval it will give the TCP fallback more time, which
334
361
// is more active in dealing with lost packets, and it gives more
335
362
// time to wait for indirect acks/nacks.
336
- m .logger .Printf ("[DEBUG] memberlist: Failed ping: %v (timeout reached)" , node .Name )
363
+ m .logger .Printf ("[DEBUG] memberlist: Failed ping: %s (timeout reached)" , node .Name )
337
364
}
338
365
366
+ HANDLE_REMOTE_FAILURE:
339
367
// Get some random live nodes.
340
368
m .nodeLock .RLock ()
341
369
kNodes := kRandomNodes (m .config .IndirectChecks , m .nodes , func (n * nodeState ) bool {
0 commit comments