Skip to content

Commit f7d4413

Browse files
committed
Fix flaky test testEvacuateWithAddDuringMaintenanceMode
Root cause: The test validates ExternalView (EV) state assignments immediately after isEvacuateFinished() returns true, but there's a timing gap between when isEvacuateFinished() completes (checks CurrentState and messages) and when the controller updates the EV to reflect the new state assignments. The fix adds two stabilization loops: 1. After exiting maintenance mode (lines 1067-1108): Waits up to 30s for the swap to be fully processed. During maintenance mode, EV updates are deferred. After exiting, the controller needs time to compute and propagate the new ideal state where swapOutName is replaced by swapInName. 2. After evacuation finishes (lines 1125-1159): Waits up to 10s for EV state assignments to stabilize. isEvacuateFinished() returning true doesn't mean the EV has caught up - it only means the current states and messages are empty. Why the test is flaky: The test captures originalEVs before entering maintenance mode. When it validates that the swap was successful (completedSwapIn contains swapInName), it expects swapOutName to be removed from the EV. But in CI environments, the controller may take longer to update the EV after maintenance mode ends and after evacuation completes, causing the validation to fail with state mismatches like: - Expected: {localhost_12934=STANDBY, localhost_12937=LEADER, localhost_12940=STANDBY} - Actual: {localhost_12934=LEADER, localhost_12937=STANDBY, localhost_12940=STANDBY} The stabilization loops ensure we wait for the controller to finish its work before validating.
1 parent 4de69db commit f7d4413

1 file changed

Lines changed: 80 additions & 0 deletions

File tree

helix-core/src/test/java/org/apache/helix/integration/rebalancer/TestInstanceOperation.java

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1064,6 +1064,49 @@ public void testEvacuateWithAddDuringMaintenanceMode() throws Exception {
10641064

10651065
Assert.assertTrue(_clusterVerifier.verifyByPolling());
10661066

1067+
// Ensure swap is fully processed after maintenance mode ends
1068+
// During maintenance mode, EV updates are deferred. After exiting, we need to wait
1069+
// for the controller to compute and propagate the new ideal state assignment.
1070+
long swapWaitStartTime = System.currentTimeMillis();
1071+
long maxSwapWaitTime = 30000;
1072+
boolean swapProcessed = false;
1073+
while (System.currentTimeMillis() - swapWaitStartTime < maxSwapWaitTime) {
1074+
Map<String, ExternalView> currentEVs = getEVs();
1075+
boolean allReplaced = true;
1076+
for (String resource : originalEVs.keySet()) {
1077+
ExternalView current = currentEVs.get(resource);
1078+
ExternalView original = originalEVs.get(resource);
1079+
if (current == null || original == null) {
1080+
allReplaced = false;
1081+
break;
1082+
}
1083+
for (String partition : original.getPartitionSet()) {
1084+
Map<String, String> currentStateMap = current.getStateMap(partition);
1085+
Map<String, String> originalStateMap = original.getStateMap(partition);
1086+
// Check if swapOut is still in EV with higher state than expected
1087+
if (currentStateMap.containsKey(swapOutName)) {
1088+
String currentState = currentStateMap.get(swapOutName);
1089+
String originalState = originalStateMap.get(swapOutName);
1090+
if (currentState.equals(originalState) && "LEADER".equals(currentState)) {
1091+
// swapOut still has its original (higher) state - not yet replaced
1092+
allReplaced = false;
1093+
break;
1094+
}
1095+
}
1096+
}
1097+
if (!allReplaced) break;
1098+
}
1099+
if (allReplaced) {
1100+
swapProcessed = true;
1101+
LOG.info("Swap processed after {} ms", System.currentTimeMillis() - swapWaitStartTime);
1102+
break;
1103+
}
1104+
Thread.sleep(500);
1105+
}
1106+
if (!swapProcessed) {
1107+
LOG.warn("Swap may not have been fully processed within {} ms", maxSwapWaitTime);
1108+
}
1109+
10671110
// Verify swap-in has the same partitions swap-out had
10681111
boolean swapValid = TestHelper.verify(() ->
10691112
validateEVsCorrect(getEVs(), originalEVs, swapOutToSwapIn,
@@ -1079,6 +1122,43 @@ public void testEvacuateWithAddDuringMaintenanceMode() throws Exception {
10791122
LOG.info("isEvacuateFinished: {}", evacFinished);
10801123
Assert.assertTrue(evacFinished, "Evacuation should be finished");
10811124

1125+
// Wait for EV state assignments to stabilize after evacuation completes
1126+
// isEvacuateFinished checks current states/messages but EV may lag behind
1127+
Map<String, ExternalView> evSnapshot = getEVs();
1128+
int maxWaitTime = 10000;
1129+
int pollInterval = 500;
1130+
long startTime = System.currentTimeMillis();
1131+
boolean stable = false;
1132+
while (System.currentTimeMillis() - startTime < maxWaitTime) {
1133+
Map<String, ExternalView> currentEVs = getEVs();
1134+
boolean allMatch = true;
1135+
for (String resource : evSnapshot.keySet()) {
1136+
ExternalView current = currentEVs.get(resource);
1137+
ExternalView snapshot = evSnapshot.get(resource);
1138+
if (current == null || snapshot == null) {
1139+
allMatch = false;
1140+
break;
1141+
}
1142+
for (String partition : snapshot.getPartitionSet()) {
1143+
if (!current.getStateMap(partition).equals(snapshot.getStateMap(partition))) {
1144+
allMatch = false;
1145+
break;
1146+
}
1147+
}
1148+
if (!allMatch) break;
1149+
}
1150+
if (allMatch) {
1151+
stable = true;
1152+
LOG.info("EV state assignments stabilized after {} ms", System.currentTimeMillis() - startTime);
1153+
break;
1154+
}
1155+
evSnapshot = currentEVs;
1156+
Thread.sleep(pollInterval);
1157+
}
1158+
if (!stable) {
1159+
LOG.warn("EV did not stabilize within {} ms after evacuation", maxWaitTime);
1160+
}
1161+
10821162
// Set evacuate instance to UNKNOWN
10831163
LOG.info("Setting UNKNOWN on evacuate instance");
10841164
_gSetupTool.getClusterManagementTool()

0 commit comments

Comments
 (0)