|
55 | 55 | import com.google.common.collect.Iterables; |
56 | 56 | import com.google.common.collect.Sets; |
57 | 57 | import com.google.common.util.concurrent.Uninterruptibles; |
| 58 | +import org.apache.cassandra.exceptions.RequestFailureReason; |
58 | 59 | import org.slf4j.Logger; |
59 | 60 | import org.slf4j.LoggerFactory; |
60 | 61 |
|
@@ -165,6 +166,9 @@ public class Gossiper implements IFailureDetectionEventListener, GossiperMBean, |
165 | 166 | @VisibleForTesting |
166 | 167 | final Set<InetAddressAndPort> liveEndpoints = new ConcurrentSkipListSet<>(); |
167 | 168 |
|
| 169 | + /* Endpoints that currently have an ECHO_REQ in flight; markAlive() returns early for an address already in this set. */ |
| 170 | + private final Set<InetAddressAndPort> inflightEcho = new ConcurrentSkipListSet<>(); |
| 171 | + |
168 | 172 | /* unreachable member set */ |
169 | 173 | private final Map<InetAddressAndPort, Long> unreachableEndpoints = new ConcurrentHashMap<>(); |
170 | 174 |
|
@@ -729,6 +733,7 @@ public void removeEndpoint(InetAddressAndPort endpoint) |
729 | 733 | return; |
730 | 734 |
|
731 | 735 | liveEndpoints.remove(endpoint); |
| 736 | + inflightEcho.remove(endpoint); |
732 | 737 | unreachableEndpoints.remove(endpoint); |
733 | 738 | MessagingService.instance().versions.reset(endpoint); |
734 | 739 | quarantineEndpoint(endpoint); |
@@ -1430,12 +1435,53 @@ void notifyFailureDetector(InetAddressAndPort endpoint, EndpointState remoteEndp |
1430 | 1435 | private void markAlive(final InetAddressAndPort addr, final EndpointState localState) |
1431 | 1436 | { |
1432 | 1437 | localState.markDead(); |
| 1438 | + if (!inflightEcho.add(addr)) |
| 1439 | + { |
| 1440 | + return; |
| 1441 | + } |
1433 | 1442 |
|
1434 | 1443 | Message<NoPayload> echoMessage = Message.out(ECHO_REQ, noPayload); |
1435 | 1444 | logger.trace("Sending ECHO_REQ to {}", addr); |
1436 | | - RequestCallback echoHandler = msg -> |
| 1445 | + RequestCallback echoHandler = new RequestCallback() |
1437 | 1446 | { |
1438 | | - runInGossipStageBlocking(() -> realMarkAlive(addr, localState)); |
/**
 * Handles the response to the ECHO_REQ sent to {@code addr}.
 *
 * The work is handed to runInGossipStageBlocking: the endpoint state for
 * {@code addr} is looked up again in endpointStateMap (rather than using the
 * localState that was passed to markAlive), given to realMarkAlive, and
 * {@code addr} is then always removed from inflightEcho.
 *
 * @param msg the echo response; its contents are not inspected here
 */
| 1447 | + @Override |
| 1448 | + public void onResponse(Message msg) |
| 1449 | + { |
| 1450 | + // force processing of the echo response onto the gossip stage, as it comes in on the REQUEST_RESPONSE stage |
| 1451 | + runInGossipStageBlocking(() -> { |
| 1452 | + try |
| 1453 | + { |
// NOTE(review): endpointStateMap.get(addr) presumably returns null if the endpoint
// was removed while the echo was pending - confirm realMarkAlive tolerates a null state.
| 1454 | + EndpointState localStatePtr = endpointStateMap.get(addr); |
| 1455 | + realMarkAlive(addr, localStatePtr); |
| 1456 | + } |
| 1457 | + finally |
| 1458 | + { |
// Clear the in-flight marker even if realMarkAlive throws, so that a later
// markAlive call for this address is not short-circuited by inflightEcho.add.
| 1459 | + inflightEcho.remove(addr); |
| 1460 | + } |
| 1461 | + }); |
| 1462 | + } |
| 1463 | + |
/**
 * Always returns {@code true}.
 *
 * NOTE(review): presumably this is what makes the messaging layer deliver
 * failures to onFailure for this callback - confirm against the
 * RequestCallback contract.
 */
| 1464 | + @Override |
| 1465 | + public boolean invokeOnFailure() |
| 1466 | + { |
| 1467 | + return true; |
| 1468 | + } |
| 1469 | + |
/**
 * Handles a failed ECHO_REQ to {@code addr}.
 *
 * While isEnabled() is true, a new ECHO_REQ is built and sent to {@code addr}
 * again with this same callback, and {@code addr} stays in inflightEcho.
 * Otherwise (the log message below describes this as gossip being disabled)
 * the echo is abandoned and {@code addr} is removed from inflightEcho.
 *
 * NOTE(review): no retry limit or backoff is visible in this method; it
 * resends on every failure for as long as isEnabled() holds. During that time
 * the in-flight marker is only cleared by code outside this callback
 * (removeEndpoint and silentlyMarkDead both remove it).
 *
 * @param from          not used; the resend targets the captured {@code addr}
 * @param failureReason not used
 */
| 1470 | + @Override |
| 1471 | + public void onFailure(InetAddressAndPort from, RequestFailureReason failureReason) |
| 1472 | + { |
| 1473 | + if (isEnabled()) |
| 1474 | + { |
| 1475 | + logger.trace("Resending ECHO_REQ to {}", addr); |
// A fresh message is built for each resend; this local has the same name as
// the echoMessage local declared in the enclosing markAlive method.
| 1476 | + Message<NoPayload> echoMessage = Message.out(ECHO_REQ, noPayload); |
// Passing `this` means any further failure re-enters this method.
| 1477 | + MessagingService.instance().sendWithCallback(echoMessage, addr, this); |
| 1478 | + } |
| 1479 | + else |
| 1480 | + { |
| 1481 | + logger.trace("Failed ECHO_REQ to {}, aborting due to disabled gossip", addr); |
| 1482 | + inflightEcho.remove(addr); |
| 1483 | + } |
| 1484 | + } |
1439 | 1485 | }; |
1440 | 1486 |
|
1441 | 1487 | MessagingService.instance().sendWithCallback(echoMessage, addr, echoHandler); |
@@ -1488,6 +1534,7 @@ public void markDead(InetAddressAndPort addr, EndpointState localState) |
1488 | 1534 | private void silentlyMarkDead(InetAddressAndPort addr, EndpointState localState) |
1489 | 1535 | { |
1490 | 1536 | localState.markDead(); |
| 1537 | + inflightEcho.remove(addr); |
1491 | 1538 | if (!disableEndpointRemoval) |
1492 | 1539 | { |
1493 | 1540 | liveEndpoints.remove(addr); |
|
0 commit comments