Skip to content

server: prevent duplicate HA works and alerts #10624

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
May 6, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,6 @@
import javax.inject.Inject;
import javax.naming.ConfigurationException;

import com.cloud.configuration.Config;
import com.cloud.utils.NumbersUtil;
import com.cloud.utils.db.GlobalLock;
import org.apache.cloudstack.agent.lb.IndirectAgentLB;
import org.apache.cloudstack.ca.CAManager;
import org.apache.cloudstack.engine.orchestration.service.NetworkOrchestrationService;
Expand All @@ -54,6 +51,7 @@
import org.apache.cloudstack.utils.identity.ManagementServerNode;
import org.apache.cloudstack.utils.reflectiontostringbuilderutils.ReflectionToStringBuilderUtils;
import org.apache.commons.lang3.BooleanUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.log4j.Logger;
import org.apache.log4j.MDC;

Expand Down Expand Up @@ -82,6 +80,7 @@
import com.cloud.agent.transport.Request;
import com.cloud.agent.transport.Response;
import com.cloud.alert.AlertManager;
import com.cloud.configuration.Config;
import com.cloud.configuration.ManagementServiceConfiguration;
import com.cloud.dc.ClusterVO;
import com.cloud.dc.DataCenterVO;
Expand All @@ -105,11 +104,13 @@
import com.cloud.resource.ResourceManager;
import com.cloud.resource.ResourceState;
import com.cloud.resource.ServerResource;
import com.cloud.utils.NumbersUtil;
import com.cloud.utils.Pair;
import com.cloud.utils.component.ManagerBase;
import com.cloud.utils.concurrency.NamedThreadFactory;
import com.cloud.utils.db.DB;
import com.cloud.utils.db.EntityManager;
import com.cloud.utils.db.GlobalLock;
import com.cloud.utils.db.QueryBuilder;
import com.cloud.utils.db.SearchCriteria.Op;
import com.cloud.utils.db.TransactionLegacy;
Expand All @@ -124,7 +125,6 @@
import com.cloud.utils.nio.NioServer;
import com.cloud.utils.nio.Task;
import com.cloud.utils.time.InaccurateClock;
import org.apache.commons.lang3.StringUtils;

/**
* Implementation of the Agent Manager. This class controls the connection to the agents.
Expand Down Expand Up @@ -208,6 +208,11 @@
protected final ConfigKey<Boolean> CheckTxnBeforeSending = new ConfigKey<Boolean>("Developer", Boolean.class, "check.txn.before.sending.agent.commands", "false",
"This parameter allows developers to enable a check to see if a transaction wraps commands that are sent to the resource. This is not to be enabled on production systems.", true);

public static final List<Host.Type> HOST_DOWN_ALERT_UNSUPPORTED_HOST_TYPES = Arrays.asList(
Host.Type.SecondaryStorage,
Host.Type.ConsoleProxy
);

@Override
public boolean configure(final String name, final Map<String, Object> params) throws ConfigurationException {

Expand Down Expand Up @@ -901,7 +906,10 @@
if (determinedState == Status.Down) {
final String message = "Host is down: " + host.getId() + "-" + host.getName() + ". Starting HA on the VMs";
s_logger.error(message);
if (host.getType() != Host.Type.SecondaryStorage && host.getType() != Host.Type.ConsoleProxy) {
if (Status.Down.equals(host.getStatus())) {
s_logger.debug(String.format("Skipping sending alert for %s as it already in %s state",
host, host.getStatus()));

Check warning on line 911 in engine/orchestration/src/main/java/com/cloud/agent/manager/AgentManagerImpl.java

View check run for this annotation

Codecov / codecov/patch

engine/orchestration/src/main/java/com/cloud/agent/manager/AgentManagerImpl.java#L910-L911

Added lines #L910 - L911 were not covered by tests
} else if (!HOST_DOWN_ALERT_UNSUPPORTED_HOST_TYPES.contains(host.getType())) {
_alertMgr.sendAlert(AlertManager.AlertType.ALERT_TYPE_HOST, host.getDataCenterId(), host.getPodId(), "Host down, " + host.getId(), message);
}
event = Status.Event.HostDown;
Expand Down
35 changes: 28 additions & 7 deletions server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
Expand All @@ -41,6 +42,7 @@
import org.apache.cloudstack.managed.context.ManagedContext;
import org.apache.cloudstack.managed.context.ManagedContextRunnable;
import org.apache.cloudstack.management.ManagementServerHost;
import org.apache.commons.collections.CollectionUtils;
import org.apache.log4j.Logger;
import org.apache.log4j.NDC;

Expand Down Expand Up @@ -71,7 +73,6 @@
import com.cloud.network.VpcVirtualNetworkApplianceService;
import com.cloud.resource.ResourceManager;
import com.cloud.server.ManagementServer;
import com.cloud.service.ServiceOfferingVO;
import com.cloud.service.dao.ServiceOfferingDao;
import com.cloud.storage.Storage.StoragePoolType;
import com.cloud.storage.StorageManager;
Expand Down Expand Up @@ -223,6 +224,18 @@
long _timeBetweenCleanups;
String _haTag = null;

private boolean vmHasPendingHAJob(final List<HaWorkVO> pendingHaWorks, final VMInstanceVO vm) {
Optional<HaWorkVO> item = pendingHaWorks.stream()
.filter(h -> h.getInstanceId() == vm.getId())
.reduce((first, second) -> second);
if (item.isPresent() && (item.get().getTimesTried() < _maxRetries ||
!item.get().canScheduleNew(_timeBetweenFailures))) {
s_logger.debug(String.format("Skipping HA on %s as there is already a running HA job for it", vm));
return true;

Check warning on line 234 in server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java

View check run for this annotation

Codecov / codecov/patch

server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java#L233-L234

Added lines #L233 - L234 were not covered by tests
}
return false;
}

protected HighAvailabilityManagerImpl() {
}

Expand Down Expand Up @@ -265,36 +278,44 @@
s_logger.warn("Scheduling restart for VMs on host " + host.getId() + "-" + host.getName());

final List<VMInstanceVO> vms = _instanceDao.listByHostId(host.getId());
final List<HaWorkVO> pendingHaWorks = _haDao.listPendingHAWorkForHost(host.getId());
final DataCenterVO dcVO = _dcDao.findById(host.getDataCenterId());

// send an email alert that the host is down
StringBuilder sb = null;
List<VMInstanceVO> reorderedVMList = new ArrayList<VMInstanceVO>();
if ((vms != null) && !vms.isEmpty()) {
int skippedHAVms = 0;
if (CollectionUtils.isNotEmpty(vms)) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this block could go in its own method.

sb = new StringBuilder();
sb.append(" Starting HA on the following VMs:");
// collect list of vm names for the alert email
for (int i = 0; i < vms.size(); i++) {
VMInstanceVO vm = vms.get(i);
for (VMInstanceVO vm : vms) {
if (vmHasPendingHAJob(pendingHaWorks, vm)) {
skippedHAVms++;
continue;

Check warning on line 295 in server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java

View check run for this annotation

Codecov / codecov/patch

server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java#L294-L295

Added lines #L294 - L295 were not covered by tests
}
if (vm.getType() == VirtualMachine.Type.User) {
reorderedVMList.add(vm);
} else {
reorderedVMList.add(0, vm);
}
if (vm.isHaEnabled()) {
sb.append(" " + vm.getHostName());
sb.append(" ").append(vm.getHostName());
}
}
}

if (reorderedVMList.isEmpty() && skippedHAVms > 0 && skippedHAVms == vms.size()) {
s_logger.debug(String.format(

Check warning on line 308 in server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java

View check run for this annotation

Codecov / codecov/patch

server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java#L308

Added line #L308 was not covered by tests
"Skipping sending alert for %s as it is suspected to be a duplicate of a recent alert", host));
return;

Check warning on line 310 in server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java

View check run for this annotation

Codecov / codecov/patch

server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java#L310

Added line #L310 was not covered by tests
}
// send an email alert that the host is down, include VMs
HostPodVO podVO = _podDao.findById(host.getPodId());
String hostDesc = "name: " + host.getName() + " (id:" + host.getId() + "), availability zone: " + dcVO.getName() + ", pod: " + podVO.getName();
_alertMgr.sendAlert(AlertManager.AlertType.ALERT_TYPE_HOST, host.getDataCenterId(), host.getPodId(), "Host is down, " + hostDesc,
"Host [" + hostDesc + "] is down." + ((sb != null) ? sb.toString() : ""));

for (VMInstanceVO vm : reorderedVMList) {
ServiceOfferingVO vmOffering = _serviceOfferingDao.findById(vm.getServiceOfferingId());
if (_itMgr.isRootVolumeOnLocalStorage(vm.getId())) {
if (s_logger.isDebugEnabled()){
s_logger.debug("Skipping HA on vm " + vm + ", because it uses local storage. Its fate is tied to the host.");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -85,4 +85,6 @@ public interface HighAvailabilityDao extends GenericDao<HaWorkVO, Long> {
List<HaWorkVO> listPendingHaWorkForVm(long vmId);

List<HaWorkVO> listPendingMigrationsForVm(long vmId);

List<HaWorkVO> listPendingHAWorkForHost(long hostId);
}
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
import java.util.Date;
import java.util.List;


import org.apache.log4j.Logger;
import org.springframework.stereotype.Component;

Expand Down Expand Up @@ -260,4 +259,17 @@

return update(vo, sc);
}

@Override
public List<HaWorkVO> listPendingHAWorkForHost(long hostId) {
SearchBuilder<HaWorkVO> sb = createSearchBuilder();
sb.and("hostId", sb.entity().getHostId(), Op.EQ);
sb.and("type", sb.entity().getWorkType(), Op.EQ);
sb.and("step", sb.entity().getStep(), Op.NIN);
SearchCriteria<HaWorkVO> sc = sb.create();
sc.setParameters("hostId", hostId);
sc.setParameters("type", WorkType.HA);
sc.setParameters("step", Step.Done, Step.Cancelled, Step.Error);
return listBy(sc);
}

Check warning on line 274 in server/src/main/java/com/cloud/ha/dao/HighAvailabilityDaoImpl.java

View check run for this annotation

Codecov / codecov/patch

server/src/main/java/com/cloud/ha/dao/HighAvailabilityDaoImpl.java#L264-L274

Added lines #L264 - L274 were not covered by tests
}
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,6 @@
import com.cloud.network.VpcVirtualNetworkApplianceService;
import com.cloud.resource.ResourceManager;
import com.cloud.server.ManagementServer;
import com.cloud.service.ServiceOfferingVO;
import com.cloud.service.dao.ServiceOfferingDao;
import com.cloud.storage.StorageManager;
import com.cloud.storage.dao.GuestOSCategoryDao;
Expand Down Expand Up @@ -214,7 +213,6 @@ public void scheduleRestartForVmsOnHostNonEmptyVMList() {
Mockito.when(_dcDao.findById(Mockito.anyLong())).thenReturn(Mockito.mock(DataCenterVO.class));
Mockito.when(_haDao.findPreviousHA(Mockito.anyLong())).thenReturn(Arrays.asList(Mockito.mock(HaWorkVO.class)));
Mockito.when(_haDao.persist((HaWorkVO)Mockito.anyObject())).thenReturn(Mockito.mock(HaWorkVO.class));
Mockito.when(_serviceOfferingDao.findById(vm1.getServiceOfferingId())).thenReturn(Mockito.mock(ServiceOfferingVO.class));

highAvailabilityManager.scheduleRestartForVmsOnHost(hostVO, true);
}
Expand Down
Loading