Skip to content

Commit 9a9b4c6

Browse files
JasperB-TeamBluedupondje
authored and committed
core: reconnect faster to host after reboot
Altered sleepOnReboot to poll the host at a fixed interval, ServerRebootSleepTime, instead of sleeping for the whole timeout. When the host is back online after the reboot, the wait is cancelled and operations continue. ServerRebootTimeout is no longer the time waited before trying to reach the host; it is now the maximum time the host has to come back online before operations are broken off. The host state is checked using the stats returned by the vdsproxy, and a successful check reports back that operations with this host can be continued. Signed-off-by: Jasper Berton <[email protected]>
1 parent 8d33abc commit 9a9b4c6

File tree

4 files changed

+57
-8
lines changed

4 files changed

+57
-8
lines changed

Diff for: backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/VdsCommand.java

+51-7
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,13 @@
22

33
import java.util.Collections;
44
import java.util.List;
5+
import java.util.concurrent.CancellationException;
6+
import java.util.concurrent.ExecutionException;
7+
import java.util.concurrent.ScheduledFuture;
58
import java.util.concurrent.TimeUnit;
9+
import java.util.concurrent.TimeoutException;
610

11+
import javax.enterprise.concurrent.ManagedScheduledExecutorService;
712
import javax.inject.Inject;
813

914
import org.apache.commons.lang.StringUtils;
@@ -37,14 +42,18 @@
3742
import org.ovirt.engine.core.dao.VdsStaticDao;
3843
import org.ovirt.engine.core.dao.gluster.GlusterDBUtils;
3944
import org.ovirt.engine.core.utils.EngineLocalConfig;
40-
import org.ovirt.engine.core.utils.ThreadUtils;
4145
import org.ovirt.engine.core.utils.lock.EngineLock;
4246
import org.ovirt.engine.core.utils.threadpool.ThreadPoolUtil;
47+
import org.ovirt.engine.core.utils.threadpool.ThreadPools;
4348
import org.ovirt.engine.core.vdsbroker.ResourceManager;
49+
import org.ovirt.engine.core.vdsbroker.vdsbroker.IVdsServer;
50+
import org.ovirt.engine.core.vdsbroker.vdsbroker.VDSInfoReturn;
51+
4452

4553
public abstract class VdsCommand<T extends VdsActionParameters> extends CommandBase<T> {
4654

4755
protected String _failureMessage = null;
56+
private ScheduledFuture<?> reachableFuture;
4857

4958
@Inject
5059
protected AuditLogDirector auditLogDirector;
@@ -68,6 +77,9 @@ public abstract class VdsCommand<T extends VdsActionParameters> extends CommandB
6877
private AlertDirector alertDirector;
6978
@Inject
7079
private VdsStaticDao vdsStaticDao;
80+
@Inject
81+
@ThreadPools(ThreadPools.ThreadPoolType.EngineScheduledThreadPool)
82+
private ManagedScheduledExecutorService executor;
7183

7284
/**
7385
* Constructor for command creation when compensation is applied on startup
@@ -112,14 +124,46 @@ protected void runSleepOnReboot(boolean synchronous, final VDSStatus status) {
112124
}
113125
}
114126

127+
/**
128+
* Enables timeout on the thread until max timeout time is exceeded or a connection is made with the rebooting device
129+
*/
115130
private void sleepOnReboot(final VDSStatus status) {
116-
int sleepTimeInSec = Config.<Integer> getValue(ConfigValues.ServerRebootTimeout);
117-
log.info("Waiting {} seconds, for server to finish reboot process.",
118-
sleepTimeInSec);
119131
resourceManager.getVdsManager(getVdsId()).setInServerRebootTimeout(true);
120-
ThreadUtils.sleep(TimeUnit.SECONDS.toMillis(sleepTimeInSec));
121-
resourceManager.getVdsManager(getVdsId()).setInServerRebootTimeout(false);
122-
setVdsStatus(status);
132+
int serverRebootMax = Config.<Integer> getValue(ConfigValues.ServerRebootTimeout);
133+
int retryTime = Config.<Integer> getValue(ConfigValues.ServerRebootSleepTime);
134+
try {
135+
reachableFuture
136+
= executor.scheduleAtFixedRate(() -> isReachable(), retryTime, retryTime, TimeUnit.SECONDS);
137+
reachableFuture.get(serverRebootMax, TimeUnit.SECONDS);
138+
} catch (InterruptedException e) {
139+
log.info("Trying to reconnect with host {} after reboot failed due to {}", getVdsId(), e.toString());
140+
} catch (ExecutionException e) {
141+
log.info("Problem during execution of reconnection with host {} after reboot due to {}", getVdsId(), e.toString());
142+
} catch (TimeoutException e) {
143+
log.info("Unable to connect to host {} after {} seconds", getVdsId(), serverRebootMax);
144+
} catch (CancellationException e) {
145+
log.info("Future cancelled due to ability to connect to host {} after reboot.", getVdsId());
146+
} finally {
147+
resourceManager.getVdsManager(getVdsId()).setInServerRebootTimeout(false);
148+
setVdsStatus(status);
149+
}
150+
}
151+
152+
/**
153+
* Checks if the host is ready to reconnect
154+
* if the status equals 0 it means the vds is done and ready to reconnect, so the thread can be interrupted
155+
*/
156+
private void isReachable() {
157+
try {
158+
IVdsServer serv = resourceManager.getVdsManager(getVdsId()).getVdsProxy();
159+
VDSInfoReturn info = serv.getVdsStats();
160+
log.info("Status of host {} is {}", getVdsId(), info.status.toString());
161+
if (info.status.code == 0) {
162+
reachableFuture.cancel(false);
163+
}
164+
} catch (Throwable t) {
165+
log.error("Error encountered {}", t.toString());
166+
}
123167
}
124168

125169
/**

Diff for: backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/config/ConfigValues.java

+2
Original file line numberDiff line numberDiff line change
@@ -234,6 +234,8 @@ public enum ConfigValues {
234234
@Reloadable
235235
@TypeConverterAttribute(Integer.class)
236236
ServerRebootTimeout,
237+
@TypeConverterAttribute(Integer.class)
238+
ServerRebootSleepTime,
237239
@Reloadable
238240
@TypeConverterAttribute(Integer.class)
239241
VmGracefulShutdownTimeout,

Diff for: packaging/dbscripts/upgrade/pre_upgrade/0000_config.sql

+1
Original file line numberDiff line numberDiff line change
@@ -622,6 +622,7 @@ select fn_db_add_config_value('ServerCPUList',
622622
'4.8');
623623

624624
select fn_db_add_config_value('ServerRebootTimeout','600','general');
625+
select fn_db_add_config_value('ServerRebootSleepTime','30','general');
625626
select fn_db_add_config_value('SetupNetworksPollingTimeout','3','general');
626627
select fn_db_add_config_value('SignCertTimeoutInSeconds','30','general');
627628
--Handling Script name for signing

Diff for: packaging/etc/engine-config/engine-config.properties

+3-1
Original file line numberDiff line numberDiff line change
@@ -116,8 +116,10 @@ SANWipeAfterDelete.description="Initializing disk image is more secure but it is
116116
SANWipeAfterDelete.validValues=true,false
117117
SearchResultsLimit.description="Max Quantity of Search Results"
118118
SearchResultsLimit.type=Integer
119-
ServerRebootTimeout.description="Host Reboot Timeout (in seconds)"
119+
ServerRebootTimeout.description="Max Host Reboot Timeout (in seconds)"
120120
ServerRebootTimeout.type=Integer
121+
ServerRebootSleepTime.description="Interval between each try to connect to host while in reboot (in seconds)"
122+
ServerRebootSleepTime.type=Integer
121123
ConsoleReleaseCursorKeys.description="Keyboard keys combination that causes the mouse cursor to be released from its grab on console client window"
122124
SpiceSecureChannels.description="SPICE Secure Channels"
123125
SpiceSecureChannels.type=StringMultiple

0 commit comments

Comments
 (0)