Commit ea0d446

Improve health check responsiveness
1 parent b0f3ba0 commit ea0d446

File tree

1 file changed: +19 -12 lines changed


multiapps-controller-core/src/main/java/org/cloudfoundry/multiapps/controller/core/application/health/ApplicationHealthCalculator.java (+19 -12)
@@ -15,7 +15,8 @@
 import java.util.concurrent.TimeoutException;
 import java.util.function.Consumer;
 import java.util.function.Supplier;
-
+import jakarta.inject.Inject;
+import jakarta.inject.Named;
 import org.cloudfoundry.multiapps.common.SLException;
 import org.cloudfoundry.multiapps.controller.client.util.CheckedSupplier;
 import org.cloudfoundry.multiapps.controller.client.util.ResilientOperationExecutor;
@@ -35,17 +36,14 @@
 import org.springframework.http.HttpStatus;
 import org.springframework.http.ResponseEntity;
 
-import jakarta.inject.Inject;
-import jakarta.inject.Named;
-
 @Named
 public class ApplicationHealthCalculator {
 
     private static final Logger LOGGER = LoggerFactory.getLogger(ApplicationHealthCalculator.class);
 
     private static final int UPDATE_HEALTH_CHECK_STATUS_PERIOD_IN_SECONDS = 10;
     private static final int SINGLE_TASK_TIMEOUT_IN_SECONDS = 70; // timeout is set to 70 so it is higher than the DB connection acquisition
-                                                                  // timeout
+                                                                   // timeout
     private static final int TOTAL_TASK_TIMEOUT_IN_SECONDS = 3 * SINGLE_TASK_TIMEOUT_IN_SECONDS;
 
     private final ObjectStoreFileStorage objectStoreFileStorage;
@@ -54,19 +52,20 @@ public class ApplicationHealthCalculator {
     private final DatabaseMonitoringService databaseMonitoringService;
     private final DatabaseWaitingLocksAnalyzer databaseWaitingLocksAnalyzer;
 
-    private final CachedObject<Boolean> objectStoreFileStorageHealthCache = new CachedObject<>(Duration.ofSeconds(TOTAL_TASK_TIMEOUT_IN_SECONDS));
+    private final CachedObject<Boolean> objectStoreFileStorageHealthCache = new CachedObject<>(
+            Duration.ofSeconds(TOTAL_TASK_TIMEOUT_IN_SECONDS));
     private final CachedObject<Boolean> dbHealthServiceCache = new CachedObject<>(Duration.ofSeconds(TOTAL_TASK_TIMEOUT_IN_SECONDS));
     private final CachedObject<Boolean> hasIncreasedLocksCache = new CachedObject<>(false,
            Duration.ofSeconds(TOTAL_TASK_TIMEOUT_IN_SECONDS));
     private final ScheduledExecutorService scheduler = Executors.newSingleThreadScheduledExecutor();
     private final ExecutorService taskExecutor = new ThreadPoolExecutor(3,
-            3,
+            9,
             0L,
             TimeUnit.MILLISECONDS,
             new SynchronousQueue<>(),
             new ThreadPoolExecutor.AbortPolicy());
     private final ExecutorService timeoutExecutor = new ThreadPoolExecutor(3,
-            3,
+            9,
             0L,
             TimeUnit.MILLISECONDS,
             new SynchronousQueue<>(),
@@ -120,11 +119,13 @@ private void executeFuture(Future<Boolean> future, Consumer<Boolean> consumer, b
             Thread.currentThread()
                   .interrupt();
             LOGGER.error(Messages.THREAD_WAS_INTERRUPTED_WHILE_WAITING_FOR_THE_RESULT_OF_A_FUTURE, e);
+            future.cancel(true);
             consumer.accept(onErrorValue);
         } catch (Exception e) {
             LOGGER.error(MessageFormat.format(Messages.ERROR_OCCURRED_DURING_HEALTH_CHECKING_FOR_INSTANCE_0_MESSAGE_1,
                     applicationConfiguration.getApplicationInstanceIndex(), errorMessage),
                     e);
+            future.cancel(true);
             consumer.accept(onErrorValue);
         }
     }
@@ -150,13 +151,16 @@ public ResponseEntity<ApplicationHealthResult> calculateApplicationHealth() {
         }
         boolean hasIncreasedDbLocks = hasIncreasedLocksCache.getOrRefresh(() -> true);
         if (hasIncreasedDbLocks) {
-            LOGGER.warn(MessageFormat.format(Messages.DETECTED_INCREASED_NUMBER_OF_PROCESSES_WAITING_FOR_LOCKS_FOR_INSTANCE_0_GETTING_THE_LOCKS,
-                    applicationConfiguration.getApplicationInstanceIndex()));
-            long countOfProcessesWaitingForLocks = resilientOperationExecutor.execute((Supplier<Long>) () -> databaseMonitoringService.getProcessesWaitingForLocks(ApplicationInstanceNameUtil.buildApplicationInstanceTemplate(applicationConfiguration)));
+            LOGGER.warn(
+                    MessageFormat.format(Messages.DETECTED_INCREASED_NUMBER_OF_PROCESSES_WAITING_FOR_LOCKS_FOR_INSTANCE_0_GETTING_THE_LOCKS,
+                            applicationConfiguration.getApplicationInstanceIndex()));
+            long countOfProcessesWaitingForLocks = resilientOperationExecutor.execute(
+                    (Supplier<Long>) () -> databaseMonitoringService.getProcessesWaitingForLocks(
+                            ApplicationInstanceNameUtil.buildApplicationInstanceTemplate(applicationConfiguration)));
             LOGGER.warn(MessageFormat.format(Messages.DETECTED_INCREASED_NUMBER_OF_PROCESSES_WAITING_FOR_LOCKS_FOR_INSTANCE,
                     countOfProcessesWaitingForLocks, applicationConfiguration.getApplicationInstanceIndex()));
             return ResponseEntity.ok(ImmutableApplicationHealthResult.builder() // TODO: Make this return 503 instead of 200 when the
-                                     // detection is trustworthy
+                                      // detection is trustworthy
                                     .status(ApplicationHealthResult.Status.DOWN)
                                     .hasIncreasedLocks(true)
                                     .countOfProcessesWaitingForLocks(countOfProcessesWaitingForLocks)
@@ -194,6 +198,7 @@ private boolean testObjectStoreConnectionWithTimeout() throws ExecutionException
             LOGGER.debug(Messages.CHECKING_OBJECT_STORE_HEALTH);
             return future.get(SINGLE_TASK_TIMEOUT_IN_SECONDS, TimeUnit.SECONDS);
         } catch (TimeoutException e) {
+            future.cancel(true);
             throw new SLException(e, Messages.TIMEOUT_WHILE_CHECKING_OBJECT_STORE_HEALTH);
         }
     }
@@ -219,6 +224,7 @@ private boolean testDatabaseConnectionWithTimeout() throws ExecutionException, I
             LOGGER.debug(Messages.CHECKING_DATABASE_HEALTH);
             return future.get(SINGLE_TASK_TIMEOUT_IN_SECONDS, TimeUnit.SECONDS);
         } catch (TimeoutException e) {
+            future.cancel(true);
             throw new SLException(e, Messages.TIMEOUT_WHILE_CHECKING_DATABASE_HEALTH);
         }
     }
@@ -229,6 +235,7 @@ private boolean checkForIncreasedLocksWithTimeout() throws ExecutionException, I
             LOGGER.debug(Messages.CHECKING_FOR_INCREASED_LOCKS);
             return future.get(SINGLE_TASK_TIMEOUT_IN_SECONDS, TimeUnit.SECONDS);
         } catch (TimeoutException e) {
+            future.cancel(true);
             throw new SLException(e, Messages.TIMEOUT_WHILE_CHECKING_FOR_INCREASED_LOCKS);
        }
    }
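
The recurring pattern in this commit is a bounded wait on a health-check Future followed by future.cancel(true) when the wait times out, is interrupted, or fails, so a stuck probe gives its pool thread back instead of blocking the next scheduled check. Below is a minimal, self-contained sketch of that pattern; the class and method names (BoundedHealthCheck, checkWithTimeout) are illustrative and not part of the repository, while the executor settings and the 70-second timeout mirror the values visible in the diff.

import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Future;
import java.util.concurrent.SynchronousQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;

public class BoundedHealthCheck {

    private static final int SINGLE_TASK_TIMEOUT_IN_SECONDS = 70;

    // Same shape as the diff's executors: core 3, max 9, no task queue
    // (SynchronousQueue) and AbortPolicy, so excess submissions fail fast
    // instead of piling up behind a stuck probe.
    private static final ExecutorService TASK_EXECUTOR =
            new ThreadPoolExecutor(3, 9, 0L, TimeUnit.MILLISECONDS,
                                   new SynchronousQueue<>(), new ThreadPoolExecutor.AbortPolicy());

    // Waits for the probe with a bounded timeout; on timeout, interruption or
    // failure the task is cancelled so its worker thread is interrupted and freed.
    static boolean checkWithTimeout(Callable<Boolean> probe) {
        Future<Boolean> future = TASK_EXECUTOR.submit(probe);
        try {
            return future.get(SINGLE_TASK_TIMEOUT_IN_SECONDS, TimeUnit.SECONDS);
        } catch (TimeoutException | ExecutionException e) {
            future.cancel(true); // the call this commit adds to the catch blocks
            return false;
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            future.cancel(true);
            return false;
        }
    }
}

A caller would wrap each probe, for example checkWithTimeout(() -> pingObjectStore()) with pingObjectStore() as a placeholder, and feed the result into a time-bounded cache the way the calculator refreshes its CachedObject fields.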

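The other change, raising both executors' maximum pool size from 3 to 9, interacts with the SynchronousQueue and AbortPolicy they are built with: a submission is rejected immediately when no worker thread is free, it is never queued. A hypothetical demo of that behaviour follows (the class name SaturationDemo and the 5-second sleep are made up for illustration).

import java.util.concurrent.RejectedExecutionException;
import java.util.concurrent.SynchronousQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;

public class SaturationDemo {

    public static void main(String[] args) {
        // Same construction as in the diff: no queue, excess work is rejected.
        ThreadPoolExecutor pool = new ThreadPoolExecutor(3, 9, 0L, TimeUnit.MILLISECONDS,
                                                         new SynchronousQueue<>(),
                                                         new ThreadPoolExecutor.AbortPolicy());
        // With a maximum of 3 the fourth submission below is rejected while the first
        // three tasks are still running; with a maximum of 9 it gets its own thread.
        for (int i = 0; i < 4; i++) {
            int id = i;
            try {
                pool.submit(() -> {
                    Thread.sleep(5_000); // stand-in for a slow health probe
                    return id;
                });
                System.out.println("submitted probe " + id);
            } catch (RejectedExecutionException e) {
                System.out.println("probe " + id + " rejected: all pool threads busy");
            }
        }
        pool.shutdownNow();
    }
}

Given the 10-second refresh period and the 70-second single-task timeout declared in the same file, the higher ceiling lets fresh probes start while earlier slow ones are still being cancelled, which appears to be the aim of the change.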