Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 15 additions & 4 deletions src/middlewared/middlewared/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -1927,10 +1927,21 @@ def main():

setup_logging('middleware', args.debug_level, args.log_handler)

# zettarepl runs in a child process via multiprocessing. We use 'spawn' instead of the Linux
# default 'fork' so that the child does not inherit the parent's logging handlers (which would
# cause duplicate log output) or its asyncio SIGTERM handler (which would make the child
# unkillable without SIGKILL).
# Use 'spawn' instead of the Linux default 'fork' for multiprocessing. Using 'fork' in
# multithreaded processes is highly discouraged by the Python docs and may lead to deadlocks.
#
# middlewared is multithreaded (asyncio event loop, ThreadPoolExecutor workers, etc.). With 'fork', only
# the calling thread is duplicated in the child — but all mutexes are copied in their current
# state. Locks held by other threads at the moment of fork become permanently locked in the
# child (the owning threads don't exist there), causing deadlocks. For example, `disk.retaste`
# uses multiprocessing.Pool whose forked workers can deadlock and never exit, hanging
# `os.waitpid` forever during pool cleanup.
#
# Forked children also inherit the parent's signal handlers and logging handlers. This causes
# zettarepl (which runs as a multiprocessing child) to inherit the asyncio SIGTERM handler
# (making it unkillable without SIGKILL) and to emit duplicate log output through the
# inherited logging handlers.
#
# 'spawn' starts a fresh Python interpreter via fork+exec, so the child inherits none of the
# parent's locks, signal handlers, or logging handlers.
multiprocessing.set_start_method('spawn')

middleware = Middleware(
Expand Down
2 changes: 1 addition & 1 deletion src/middlewared/middlewared/plugins/disk_/retaste.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def retaste_disks_impl(disk_serials: set = None):
errors = m.dict()
with multiprocessing.Pool() as p:
# we use processes so that these operations are truly
# "parrallel" (side-step the GIL) since we have systems
# "parallel" (side-step the GIL) since we have systems
# with 1k+ disks. Since this runs, potentially, on failover
# event we need to squeeze out every bit of perf we can get
p.starmap(taste_it, [(disk, errors) for disk in disks])
Expand Down
6 changes: 3 additions & 3 deletions src/middlewared/middlewared/plugins/failover_/event.py
Original file line number Diff line number Diff line change
Expand Up @@ -605,9 +605,9 @@ def vrrp_master(self, job, fobj, ifname, event):
try:
logger.info('Retasting disks on standby node')
self.run_call('failover.call_remote', 'disk.retaste', [], {'raise_connect_error': False})
logger.info('Done retasting disks on standby node')
logger.info('Done scheduling retasting disks on standby node')
except Exception:
logger.exception('Unexpected failure retasting disks on standby node')
logger.exception('Unexpected failure scheduling retasting disks on standby node')

# setup the zpool cachefile TODO: see comment below about cachefile usage
# self.run_call('failover.zpool.cachefile.setup', 'MASTER')
Expand Down Expand Up @@ -1063,7 +1063,7 @@ def vrrp_backup(self, job, fobj, ifname, event):

logger.info('Retasting disks (if required)')
self.run_call('disk.retaste')
logger.info('Done retasting disks (if required)')
logger.info('Done scheduling retasting disks (if required)')

logger.info('Activating directory services')
try:
Expand Down
Loading