Skip to content

Commit 1d579f6

Browse files
authored
Pass mp context to strategy (#651)
## Summary Fixes spawn and forkserver multi-process contexts. ## Details I was hoping that after #647 we could switch to `forkserver` by default. However, it turns out that `forkserver` and `spawn` will import the calling process's entrypoint (e.g. `__main__.py`) so we run into the same blocker as #641. However, I was able to confirm that stripping every heavy import out of `__main__.py` solves the issue. So we should be good to switch in v0.7.0. On my machine there is about a ~10s overhead for `forkserver` and slightly more for `spawn`, which is not the worst for a default. However, the overhead may be more on other systems: ### `time guidellm benchmark run --profile poisson --rate 5 --data prompt_tokens=128,output_tokens=128 --max-seconds 30 --outputs json` | Context | real | user | sys | | ---------- | --------- | --------- | -------- | | Fork | 0m37.874s | 0m17.356s | 0m1.883s | | Forkserver | 0m47.344s | 0m14.862s | 0m0.860s | | Spawn | 0m49.515s | 1m51.230s | 0m8.915s | ### `time guidellm benchmark run --profile concurrent --rate 400 --data prompt_tokens=128,output_tokens=128 --max-seconds 30 --outputs json` | Context | real | user | sys | | ---------- | --------- | --------- | --------- | | Fork | 0m39.324s | 0m37.602s | 0m5.623s | | Forkserver | 0m49.609s | 0m19.710s | 0m1.311s | | Spawn | 0m50.399s | 2m9.724s | 0m11.374s | ### `time guidellm benchmark run --profile concurrent --rate 400 --data prompt_tokens=128,output_tokens=128 --max-seconds 120 --outputs json` | Context | real | user | sys | | ---------- | --------- | --------- | --------- | | Fork | 2m15.309s | 1m42.911s | 0m15.957s | | Forkserver | 2m25.964s | 0m38.891s | 0m2.802s | | Spawn | 2m27.454s | 3m24.325s | 0m22.531s | ## Test Plan Set `GUIDELLM__MP_CONTEXT_TYPE=forkserver` and confirm benchmarks run. --- - [x] "I certify that all code in this PR is my own, except as noted below." 
## Use of AI - [x] Includes AI-assisted code completion - [ ] Includes code generated by an AI application - [ ] Includes AI-generated tests (NOTE: AI written tests should have a docstring that includes `## WRITTEN BY AI ##`)
2 parents 5e1f06b + 3acacc5 commit 1d579f6

File tree

3 files changed

+36
-14
lines changed

3 files changed

+36
-14
lines changed

src/guidellm/scheduler/strategies.py

Lines changed: 17 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,8 @@
1919
import math
2020
import random
2121
from abc import abstractmethod
22-
from multiprocessing import Event, Value, synchronize
22+
from multiprocessing import synchronize
23+
from multiprocessing.context import BaseContext
2324
from multiprocessing.sharedctypes import Synchronized
2425
from typing import Annotated, ClassVar, Literal, TypeVar
2526

@@ -103,7 +104,10 @@ def requests_limit(self) -> PositiveInt | None:
103104
return None
104105

105106
def init_processes_timings(
106-
self, worker_count: PositiveInt, max_concurrency: PositiveInt
107+
self,
108+
worker_count: PositiveInt,
109+
max_concurrency: PositiveInt,
110+
mp_context: BaseContext,
107111
):
108112
"""
109113
Initialize shared timing state for multi-process coordination.
@@ -117,9 +121,9 @@ def init_processes_timings(
117121
self.worker_count = worker_count
118122
self.max_concurrency = max_concurrency
119123

120-
self._processes_init_event = Event()
121-
self._processes_request_index = Value("i", 0)
122-
self._processes_start_time = Value("d", -1.0)
124+
self._processes_init_event = mp_context.Event()
125+
self._processes_request_index = mp_context.Value("i", 0)
126+
self._processes_start_time = mp_context.Value("d", -1.0)
123127

124128
def init_processes_start(self, start_time: float):
125129
"""
@@ -593,7 +597,12 @@ def requests_limit(self) -> PositiveInt | None:
593597
"""
594598
return self.max_concurrency
595599

596-
def init_processes_timings(self, worker_count: int, max_concurrency: int):
600+
def init_processes_timings(
601+
self,
602+
worker_count: PositiveInt,
603+
max_concurrency: PositiveInt,
604+
mp_context: BaseContext,
605+
):
597606
"""
598607
Initialize Poisson-specific timing state.
599608
@@ -603,10 +612,10 @@ def init_processes_timings(self, worker_count: int, max_concurrency: int):
603612
:param worker_count: Number of worker processes to coordinate
604613
:param max_concurrency: Maximum number of concurrent requests allowed
605614
"""
606-
self._offset = Value("d", -1.0)
615+
self._offset = mp_context.Value("d", -1.0)
607616
# Call base implementation last to avoid
608617
# setting Event before offset is ready
609-
super().init_processes_timings(worker_count, max_concurrency)
618+
super().init_processes_timings(worker_count, max_concurrency, mp_context)
610619

611620
def init_processes_start(self, start_time: float):
612621
"""

src/guidellm/scheduler/worker_group.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -221,7 +221,9 @@ async def create_processes(self):
221221
# Initialize worker processes
222222
self.processes = []
223223
self.strategy.init_processes_timings(
224-
worker_count=num_processes, max_concurrency=max_conc
224+
worker_count=num_processes,
225+
max_concurrency=max_conc,
226+
mp_context=self.mp_context,
225227
)
226228
for rank in range(num_processes):
227229
# Distribute any remainder across the first N ranks

tests/unit/scheduler/test_strategies.py

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
import math
44
import time
5+
from multiprocessing import get_context
56
from typing import Literal, TypeVar
67

78
import pytest
@@ -502,7 +503,9 @@ async def test_timing_without_rampup(self):
502503
### WRITTEN BY AI ###
503504
"""
504505
strategy = AsyncConstantStrategy(rate=10.0, rampup_duration=0.0)
505-
strategy.init_processes_timings(worker_count=1, max_concurrency=100)
506+
strategy.init_processes_timings(
507+
worker_count=1, max_concurrency=100, mp_context=get_context()
508+
)
506509
start_time = 1000.0
507510
strategy.init_processes_start(start_time)
508511

@@ -525,7 +528,9 @@ async def test_timing_with_rampup(self):
525528
rate = 10.0
526529
rampup_duration = 2.0
527530
strategy = AsyncConstantStrategy(rate=rate, rampup_duration=rampup_duration)
528-
strategy.init_processes_timings(worker_count=1, max_concurrency=100)
531+
strategy.init_processes_timings(
532+
worker_count=1, max_concurrency=100, mp_context=get_context()
533+
)
529534
start_time = 1000.0
530535
strategy.init_processes_start(start_time)
531536

@@ -574,7 +579,9 @@ async def test_timing_with_rampup_edge_cases(self):
574579

575580
# Test with very short rampup_duration
576581
strategy = AsyncConstantStrategy(rate=100.0, rampup_duration=0.01)
577-
strategy.init_processes_timings(worker_count=1, max_concurrency=100)
582+
strategy.init_processes_timings(
583+
worker_count=1, max_concurrency=100, mp_context=get_context()
584+
)
578585
start_time = 2000.0
579586
strategy.init_processes_start(start_time)
580587

@@ -584,7 +591,9 @@ async def test_timing_with_rampup_edge_cases(self):
584591

585592
# Test with very long rampup_duration
586593
strategy2 = AsyncConstantStrategy(rate=1.0, rampup_duration=100.0)
587-
strategy2.init_processes_timings(worker_count=1, max_concurrency=100)
594+
strategy2.init_processes_timings(
595+
worker_count=1, max_concurrency=100, mp_context=get_context()
596+
)
588597
start_time2 = 3000.0
589598
strategy2.init_processes_start(start_time2)
590599

@@ -613,7 +622,9 @@ async def test_timing_rampup_transition(self):
613622
rate = 10.0
614623
rampup_duration = 2.0
615624
strategy = AsyncConstantStrategy(rate=rate, rampup_duration=rampup_duration)
616-
strategy.init_processes_timings(worker_count=1, max_concurrency=100)
625+
strategy.init_processes_timings(
626+
worker_count=1, max_concurrency=100, mp_context=get_context()
627+
)
617628
start_time = 5000.0
618629
strategy.init_processes_start(start_time)
619630

0 commit comments

Comments
 (0)