Skip to content

Commit ee28652

Browse files
goasChrispatrickelectric
authored andcommitted
Restart autopilot on consecutive heartbeat failure
If we get no heartbeat from the subprocess, there is no way to ensure it is running safely. Killing ardupilot will now agressively prune ardupilot processes if it cannot stop the one it is attached to. Prior to this change, if we could not stop our own process, we would be in limbo, with no way to recover.
1 parent 289eafb commit ee28652

File tree

1 file changed

+36
-2
lines changed

1 file changed

+36
-2
lines changed

core/services/ardupilot_manager/autopilot_manager.py

Lines changed: 36 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,8 @@ async def setup(self) -> None:
149149
self.settings.firmware_folder, self.settings.defaults_folder, self.settings.user_firmware_folder
150150
)
151151
self.vehicle_manager = VehicleManager()
152+
self._heartbeat_fail_count = 0 # Consecutive heartbeat failures
153+
self._max_heartbeat_failures = 10 # Threshold for restarting Ardupilot after consecutive heartbeat failures
152154

153155
self.should_be_running = False
154156
self.remove_old_logs()
@@ -200,6 +202,32 @@ async def auto_restart_ardupilot(self) -> None:
200202
await self.start_ardupilot()
201203
except Exception as error:
202204
logger.warning(f"Could not start Ardupilot: {error}")
205+
206+
# Monitor MAVLink heartbeat while autopilot is supposed to be running
207+
if self.should_be_running and self.is_running():
208+
try:
209+
alive = await self.vehicle_manager.is_heart_beating()
210+
if alive:
211+
self._heartbeat_fail_count = 0
212+
else:
213+
self._heartbeat_fail_count += 1
214+
logger.warning(
215+
f"Heartbeat check False ({self._heartbeat_fail_count}/{self._max_heartbeat_failures})"
216+
)
217+
except Exception as error:
218+
self._heartbeat_fail_count += 1
219+
logger.warning(
220+
f"heartbeat check failed ({self._heartbeat_fail_count}/{self._max_heartbeat_failures}): {error}"
221+
)
222+
223+
if self._heartbeat_fail_count >= self._max_heartbeat_failures:
224+
logger.warning("Consecutive heartbeat failures threshold reached — restarting Ardupilot.")
225+
try:
226+
await self.restart_ardupilot()
227+
except Exception as error:
228+
logger.warning(f"Failed to restart Ardupilot after heartbeat failures: {error}")
229+
self._heartbeat_fail_count = 0
230+
203231
await asyncio.sleep(5.0)
204232

205233
async def start_mavlink_manager_watchdog(self) -> None:
@@ -560,8 +588,14 @@ async def kill_ardupilot(self) -> None:
560588
# TODO: Add shutdown command on HAL_SITL and HAL_LINUX, changing terminate/prune
561589
# logic with a simple "self.vehicle_manager.shutdown_vehicle()"
562590
logger.info("Terminating Ardupilot subprocess.")
563-
await self.terminate_ardupilot_subprocess()
564-
logger.info("Ardupilot subprocess terminated.")
591+
try:
592+
await self.terminate_ardupilot_subprocess()
593+
logger.info("Ardupilot subprocess terminated.")
594+
except AutoPilotProcessKillFail as error:
595+
# If we cannot control the process, we should force kill to get a clean slate
596+
# This would ensure we don't sit with a zombie process, or have lost track of the process
597+
logger.error(f"Failed to terminate Ardupilot subprocess: {error}")
598+
565599
logger.info("Pruning Ardupilot's system processes.")
566600
await self.prune_ardupilot_processes()
567601
logger.info("Ardupilot's system processes pruned.")

0 commit comments

Comments
 (0)