Fix unit-test hang: use spawn (not fork) for multiprocess jobs

kevalmorabia97 · claude · kevalmorabia97 · commit d4f9a2cbac1d · 2026-06-03T02:51:55.000-07:00
fork in the long-lived pytest process inherits locks held by background threads
(OpenMP / torch intra-op pools), deadlocking the child (e.g. in
dist.init_process_group) and hanging the job. Revert spawn_multiprocess_job to
spawn; the world_size reduction remains the speedup.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com>
diff --git a/tests/_test_utils/torch/distributed/utils.py b/tests/_test_utils/torch/distributed/utils.py
@@ -53,18 +53,9 @@ def init_process(rank, size, job=None, backend="gloo", port=None):
         job(rank, size)
 
 
-def spawn_multiprocess_job(size, job, backend="gloo", start_method=None):
-    # ``fork`` lets child processes inherit the parent's already-imported torch/modelopt
-    # modules, avoiding a ~12s re-import per process. It is only safe without CUDA (a CUDA
-    # context cannot be forked safely), so default to ``fork`` for CPU/gloo jobs and fall
-    # back to ``spawn`` when a GPU is present or a non-gloo backend is used.
-    if start_method is None:
-        start_method = "fork" if backend == "gloo" and not torch.cuda.is_available() else "spawn"
+def spawn_multiprocess_job(size, job, backend="gloo"):
     port = get_free_port()
-
-    # Use an explicit context instead of ``set_start_method(force=True)`` so we don't mutate
-    # the global multiprocessing state shared with other tests.
-    ctx = mp.get_context(start_method)
+    ctx = mp.get_context("spawn")
     processes = []
     for rank in range(size):
         p = ctx.Process(target=init_process, args=(rank, size, job, backend, port))