Skip to content

Support to enable/disable VM High Availability manager #10118

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
*/
public interface HighAvailabilityManager extends Manager {

public ConfigKey<Boolean> ForceHA = new ConfigKey<>("Advanced", Boolean.class, "force.ha", "false",
ConfigKey<Boolean> ForceHA = new ConfigKey<>("Advanced", Boolean.class, "force.ha", "false",
"Force High-Availability to happen even if the VM says no.", true, Cluster);

ConfigKey<Integer> HAWorkers = new ConfigKey<>("Advanced", Integer.class, "ha.workers", "5",
Expand Down Expand Up @@ -112,7 +112,7 @@ enum Step {

void cancelDestroy(VMInstanceVO vm, Long hostId);

void scheduleDestroy(VMInstanceVO vm, long hostId);
boolean scheduleDestroy(VMInstanceVO vm, long hostId);

/**
* Schedule restarts for all vms running on the host.
Expand Down Expand Up @@ -143,7 +143,7 @@ enum Step {
* @param hostId id of the host the virtual machine is on.
* @param type which type of stop is requested.
*/
void scheduleStop(VMInstanceVO vm, long hostId, WorkType type);
boolean scheduleStop(VMInstanceVO vm, long hostId, WorkType type);

void cancelScheduledMigrations(HostVO host);

Expand Down
129 changes: 115 additions & 14 deletions server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@
// under the License.
package com.cloud.ha;

import static org.apache.cloudstack.framework.config.ConfigKey.Scope.Zone;

import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
Expand Down Expand Up @@ -121,6 +123,16 @@
"Total number of attempts for trying migration of a VM.",
true, ConfigKey.Scope.Global);

public static ConfigKey<Boolean> VmHaEnabled = new ConfigKey<>("Advanced", Boolean.class, "vm.ha.enabled", "true",
"Enable/Disable VM High Availability manager, it is enabled by default."
+ " When enabled, the VM HA WorkItems (for VM Stop, Restart, Migration, Destroy) can be created and the scheduled items are executed; and"
+ " When disabled, new VM HA WorkItems are not allowed and the scheduled items are retried until max retries configured at 'vm.ha.migration.max.retries'"
+ " (executed in case HA is re-enabled during retry attempts), and then purged after 'time.between.failures' by the cleanup thread that runs"
+ " regularly at 'time.between.cleanup'", true, Zone);

protected static ConfigKey<Boolean> VmHaAlertsEnabled = new ConfigKey<>("Advanced", Boolean.class, "vm.ha.alerts.enabled", "true",
"Enable/Disable alerts for the VM HA operations, it is enabled by default.", true, Zone);

WorkerThread[] _workers;
boolean _stopped;
long _timeToSleep;
Expand Down Expand Up @@ -185,7 +197,6 @@
_haPlanners = haPlanners;
}


@Inject
AgentManager _agentMgr;
@Inject
Expand Down Expand Up @@ -231,6 +242,15 @@
return Status.Alert;
}

if (!VmHaEnabled.valueIn(host.getDataCenterId())) {
String message = String.format("Unable to investigate the host %s (%d), VM high availability manager is disabled.", host.getName(), hostId);
if (logger.isDebugEnabled()) {
logger.debug(message);

Check warning on line 248 in server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java

View check run for this annotation

Codecov / codecov/patch

server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java#L248

Added line #L248 was not covered by tests
}
sendHostAlert(host, message);
return Status.Alert;
}

Status hostState = null;
for (Investigator investigator : investigators) {
hostState = investigator.isAgentAlive(host);
Expand Down Expand Up @@ -260,6 +280,15 @@
return;
}

if (!VmHaEnabled.valueIn(host.getDataCenterId())) {
String message = String.format("Unable to schedule restart for VMs on host %s (%d), VM high availability manager is disabled.", host.getName(), host.getId());
if (logger.isDebugEnabled()) {
logger.debug(message);

Check warning on line 286 in server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java

View check run for this annotation

Codecov / codecov/patch

server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java#L286

Added line #L286 was not covered by tests
}
sendHostAlert(host, message);
return;
}

logger.warn("Scheduling restart for VMs on host " + host.getId() + "-" + host.getName());

final List<VMInstanceVO> vms = _instanceDao.listByHostId(host.getId());
Expand Down Expand Up @@ -314,12 +343,21 @@
}

@Override
public void scheduleStop(VMInstanceVO vm, long hostId, WorkType type) {
public boolean scheduleStop(VMInstanceVO vm, long hostId, WorkType type) {
assert (type == WorkType.CheckStop || type == WorkType.ForceStop || type == WorkType.Stop);

if (_haDao.hasBeenScheduled(vm.getId(), type)) {
logger.info("There's already a job scheduled to stop " + vm);
return;
return false;

Check warning on line 351 in server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java

View check run for this annotation

Codecov / codecov/patch

server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java#L351

Added line #L351 was not covered by tests
}

if (!VmHaEnabled.valueIn(vm.getDataCenterId())) {
String message = String.format("Unable to schedule stop for the VM %s (%d) on host %d, VM high availability manager is disabled.", vm.getName(), vm.getId(), hostId);
if (logger.isDebugEnabled()) {
logger.debug(message);

Check warning on line 357 in server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java

View check run for this annotation

Codecov / codecov/patch

server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java#L357

Added line #L357 was not covered by tests
}
sendVMAlert(vm, message);
return false;
}

HaWorkVO work = new HaWorkVO(vm.getId(), vm.getType(), type, Step.Scheduled, hostId, vm.getState(), 0, vm.getUpdated());
Expand All @@ -328,6 +366,7 @@
logger.debug("Scheduled " + work);
}
wakeupWorkers();
return true;
}

protected void wakeupWorkers() {
Expand All @@ -339,17 +378,37 @@

@Override
public boolean scheduleMigration(final VMInstanceVO vm) {
if (vm.getHostId() != null) {
final HaWorkVO work = new HaWorkVO(vm.getId(), vm.getType(), WorkType.Migration, Step.Scheduled, vm.getHostId(), vm.getState(), 0, vm.getUpdated());
_haDao.persist(work);
logger.info("Scheduled migration work of VM " + vm.getUuid() + " from host " + _hostDao.findById(vm.getHostId()) + " with HAWork " + work);
wakeupWorkers();
if (vm.getHostId() == null) {
return false;

Check warning on line 382 in server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java

View check run for this annotation

Codecov / codecov/patch

server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java#L382

Added line #L382 was not covered by tests
}

if (!VmHaEnabled.valueIn(vm.getDataCenterId())) {
String message = String.format("Unable to schedule migration for the VM %s (%d) on host %d, VM high availability manager is disabled.", vm.getName(), vm.getId(), vm.getHostId());
if (logger.isDebugEnabled()) {
logger.debug(message);

Check warning on line 388 in server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java

View check run for this annotation

Codecov / codecov/patch

server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java#L388

Added line #L388 was not covered by tests
}
sendVMAlert(vm, message);
return false;
}

final HaWorkVO work = new HaWorkVO(vm.getId(), vm.getType(), WorkType.Migration, Step.Scheduled, vm.getHostId(), vm.getState(), 0, vm.getUpdated());
_haDao.persist(work);
logger.info("Scheduled migration work of VM " + vm.getUuid() + " from host " + _hostDao.findById(vm.getHostId()) + " with HAWork " + work);
wakeupWorkers();
return true;
}

@Override
public void scheduleRestart(VMInstanceVO vm, boolean investigate) {
if (!VmHaEnabled.valueIn(vm.getDataCenterId())) {
String message = String.format("Unable to schedule restart for the VM %s (%d), VM high availability manager is disabled.", vm.getName(), vm.getId());
if (logger.isDebugEnabled()) {
logger.debug(message);

Check warning on line 406 in server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java

View check run for this annotation

Codecov / codecov/patch

server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java#L406

Added line #L406 was not covered by tests
}
sendVMAlert(vm, message);
return;
}

logger.debug("HA schedule restart");
Long hostId = vm.getHostId();
if (hostId == null) {
Expand Down Expand Up @@ -440,7 +499,6 @@
}

wakeupWorkers();

}

private void startVm(VirtualMachine vm, Map<VirtualMachineProfile.Param, Object> params,
Expand Down Expand Up @@ -737,13 +795,23 @@
}

@Override
public void scheduleDestroy(VMInstanceVO vm, long hostId) {
public boolean scheduleDestroy(VMInstanceVO vm, long hostId) {
if (!VmHaEnabled.valueIn(vm.getDataCenterId())) {
String message = String.format("Unable to schedule destroy for the VM %s (%d) on host %d, VM high availability manager is disabled.", vm.getName(), vm.getId(), hostId);
if (logger.isDebugEnabled()) {
logger.debug(message);

Check warning on line 802 in server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java

View check run for this annotation

Codecov / codecov/patch

server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java#L802

Added line #L802 was not covered by tests
}
sendVMAlert(vm, message);
return false;
}

final HaWorkVO work = new HaWorkVO(vm.getId(), vm.getType(), WorkType.Destroy, Step.Scheduled, hostId, vm.getState(), 0, vm.getUpdated());
_haDao.persist(work);
if (logger.isDebugEnabled()) {
logger.debug("Scheduled " + work.toString());
}
wakeupWorkers();
return true;
}

@Override
Expand Down Expand Up @@ -892,7 +960,17 @@

private void processWork(final HaWorkVO work) {
final WorkType wt = work.getWorkType();
final VMInstanceVO vm = _instanceDao.findById(work.getInstanceId());
try {
if (vm != null && !VmHaEnabled.valueIn(vm.getDataCenterId())) {
if (logger.isDebugEnabled()) {
logger.debug(String.format("VM high availability manager is disabled, rescheduling the HA work %s, for the VM %s (%d) to retry later in case VM high availability manager is enabled on retry attempt", work, vm.getName(), vm.getId()));

Check warning on line 967 in server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java

View check run for this annotation

Codecov / codecov/patch

server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java#L967

Added line #L967 was not covered by tests
}
long nextTime = getRescheduleTime(wt);
rescheduleWork(work, nextTime);
return;

Check warning on line 971 in server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java

View check run for this annotation

Codecov / codecov/patch

server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java#L969-L971

Added lines #L969 - L971 were not covered by tests
}

Long nextTime = null;
if (wt == WorkType.Migration) {
nextTime = migrate(work);
Expand Down Expand Up @@ -921,9 +999,10 @@

// if restart failed in the middle due to an exception, the VM state may have been changed
// recapture it into the HA worker so that it can really continue in its next turn
VMInstanceVO vm = _instanceDao.findById(work.getInstanceId());
work.setUpdateTime(vm.getUpdated());
work.setPreviousState(vm.getState());
if (vm != null) {
work.setUpdateTime(vm.getUpdated());
work.setPreviousState(vm.getState());

Check warning on line 1004 in server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java

View check run for this annotation

Codecov / codecov/patch

server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java#L1003-L1004

Added lines #L1003 - L1004 were not covered by tests
}
} finally {
if (!Step.Done.equals(work.getStep())) {
if (work.getTimesTried() >= _maxRetries) {
Expand Down Expand Up @@ -1128,11 +1207,33 @@
public ConfigKey<?>[] getConfigKeys() {
return new ConfigKey[] {TimeBetweenCleanup, MigrationMaxRetries, TimeToSleep, TimeBetweenFailures,
StopRetryInterval, RestartRetryInterval, MigrateRetryInterval, InvestigateRetryInterval,
HAWorkers, ForceHA, KvmHAFenceHostIfHeartbeatFailsOnStorage};
HAWorkers, ForceHA, VmHaEnabled, VmHaAlertsEnabled, KvmHAFenceHostIfHeartbeatFailsOnStorage};
}

@Override
public int expungeWorkItemsByVmList(List<Long> vmIds, Long batchSize) {
return _haDao.expungeByVmList(vmIds, batchSize);
}

private void sendVMAlert(VMInstanceVO vm, String message) {
if (vm == null || !VmHaAlertsEnabled.valueIn(vm.getDataCenterId())) {
return;

Check warning on line 1220 in server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java

View check run for this annotation

Codecov / codecov/patch

server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java#L1220

Added line #L1220 was not covered by tests
}
AlertManager.AlertType alertType = AlertManager.AlertType.ALERT_TYPE_USERVM;
if (VirtualMachine.Type.DomainRouter.equals(vm.getType())) {
alertType = AlertManager.AlertType.ALERT_TYPE_DOMAIN_ROUTER;

Check warning on line 1224 in server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java

View check run for this annotation

Codecov / codecov/patch

server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java#L1224

Added line #L1224 was not covered by tests
} else if (VirtualMachine.Type.ConsoleProxy.equals(vm.getType())) {
alertType = AlertManager.AlertType.ALERT_TYPE_CONSOLE_PROXY;

Check warning on line 1226 in server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java

View check run for this annotation

Codecov / codecov/patch

server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java#L1226

Added line #L1226 was not covered by tests
} else if (VirtualMachine.Type.SecondaryStorageVm.equals(vm.getType())) {
alertType = AlertManager.AlertType.ALERT_TYPE_SSVM;

Check warning on line 1228 in server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java

View check run for this annotation

Codecov / codecov/patch

server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java#L1228

Added line #L1228 was not covered by tests
}
_alertMgr.sendAlert(alertType, vm.getDataCenterId(), vm.getPodIdToDeployIn(), message, message);
}

private void sendHostAlert(HostVO host, String message) {
if (host == null || !VmHaAlertsEnabled.valueIn(host.getDataCenterId())) {
return;

Check warning on line 1235 in server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java

View check run for this annotation

Codecov / codecov/patch

server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java#L1235

Added line #L1235 was not covered by tests
}
_alertMgr.sendAlert(AlertManager.AlertType.ALERT_TYPE_HOST, host.getDataCenterId(), host.getPodId(), message, message);
}
}
10 changes: 7 additions & 3 deletions server/src/main/java/com/cloud/resource/ResourceManagerImpl.java
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
import com.cloud.cpu.CPU;
import com.cloud.exception.StorageConflictException;
import com.cloud.exception.StorageUnavailableException;
import com.cloud.ha.HighAvailabilityManagerImpl;
import com.cloud.host.HostTagVO;
import com.cloud.storage.Volume;
import com.cloud.storage.VolumeVO;
Expand Down Expand Up @@ -1363,6 +1364,11 @@
throw new CloudRuntimeException("Cannot perform maintain when resource state is " + hostState + ", hostId = " + hostId);
}

final List<VMInstanceVO> vms = _vmDao.listByHostId(hostId);

Check warning on line 1367 in server/src/main/java/com/cloud/resource/ResourceManagerImpl.java

View check run for this annotation

Codecov / codecov/patch

server/src/main/java/com/cloud/resource/ResourceManagerImpl.java#L1367

Added line #L1367 was not covered by tests
if (CollectionUtils.isNotEmpty(vms) && !HighAvailabilityManagerImpl.VmHaEnabled.valueIn(host.getDataCenterId())) {
throw new CloudRuntimeException(String.format("Cannot perform maintain for the host %s (%d) as there are running VMs on it and VM high availability manager is disabled", host.getName(), hostId));

Check warning on line 1369 in server/src/main/java/com/cloud/resource/ResourceManagerImpl.java

View check run for this annotation

Codecov / codecov/patch

server/src/main/java/com/cloud/resource/ResourceManagerImpl.java#L1369

Added line #L1369 was not covered by tests
}

final MaintainAnswer answer = (MaintainAnswer)_agentMgr.easySend(hostId, new MaintainCommand());
if (answer == null || !answer.getResult()) {
logger.warn("Unable to send MaintainCommand to host: " + hostId);
Expand All @@ -1382,8 +1388,6 @@

/* TODO: move below to listener */
if (host.getType() == Host.Type.Routing) {

final List<VMInstanceVO> vms = _vmDao.listByHostId(hostId);
if (vms.size() == 0) {
return true;
}
Expand Down Expand Up @@ -2841,7 +2845,7 @@
logger.debug("Cannot transmit host " + host.getId() + " to Disabled state", e);
}
for (final VMInstanceVO vm : vms) {
if ((! HighAvailabilityManager.ForceHA.value() && !vm.isHaEnabled()) || vm.getState() == State.Stopping) {
if ((!HighAvailabilityManager.ForceHA.value() && !vm.isHaEnabled()) || vm.getState() == State.Stopping) {
logger.debug(String.format("Stopping %s as a part of hostDelete for %s",vm, host));
try {
_haMgr.scheduleStop(vm, host.getId(), WorkType.Stop);
Expand Down
Loading