
Commit 32dafa8

test(loadgen): add unit tests for worker concurrency distribution
Add a new test suite to validate how LoadGenerator splits concurrency across workers. The tests cover evenly divisible cases, remainder handling, and scenarios where concurrency is lower than the number of workers.

This also fixes Python 3.9 compatibility by replacing Type | None with Optional[Type], and adds lightweight test-time mocks for asyncio.TaskGroup and typing.TypeAlias so the suite runs cleanly on Python versions below 3.11.

Signed-off-by: Sathvik <Sathvik.S@ibm.com>
1 parent 02d3c1a commit 32dafa8
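
For reference, the distribution rule the new tests pin down can be sketched in a few lines: each worker gets the floor of concurrency / num_workers, and the first concurrency % num_workers workers take one extra slot. The helper below is an illustrative stand-in for the _set_worker_concurrency logic under test, not the repository's exact implementation:

from typing import List

def split_concurrency(concurrency_level: int, num_workers: int) -> List[int]:
    # Floor division gives each worker its base share; the remainder is
    # handed out one slot at a time to the first workers.
    base, remainder = divmod(concurrency_level, num_workers)
    return [base + (1 if i < remainder else 0) for i in range(num_workers)]

# The three cases the test suite asserts:
# split_concurrency(8, 4)  -> [2, 2, 2, 2]   evenly divisible
# split_concurrency(10, 4) -> [3, 3, 2, 2]   remainder goes to the first workers
# split_concurrency(3, 4)  -> [1, 1, 1, 0]   fewer slots than workers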

9 files changed: +125 -19 lines changed

e2e/utils/llm_d_inference_sim.py

Lines changed: 3 additions & 2 deletions

@@ -5,6 +5,7 @@
 import textwrap
 import shutil
 from contextlib import AsyncContextDecorator
+from typing import Optional


 logger = logging.getLogger(__name__)
@@ -24,7 +25,7 @@ def is_available(executable: str = "llm-d-inference-sim") -> bool:

     _host = "127.0.0.1"
     _port: int
-    _proc: asyncio.subprocess.Process | None = None
+    _proc: "Optional[asyncio.subprocess.Process]" = None
     _wait_until_ready: bool

     def __init__(
@@ -91,7 +92,7 @@ async def __aexit__(self, *exc):
     async def wait_until_ready(
         self,
         polling_sec: float = 0.5,
-        timeout_sec: float | None = 10,
+        timeout_sec: Optional[float] = 10,
     ) -> None:
         """Waits until the server is ready to serve requests."""
        assert self._proc
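
A side note on the Optional changes throughout this commit: the X | None syntax from PEP 604 is only valid in annotations that are evaluated on Python 3.10+, which is why the 3.9-compatible spelling is needed. A minimal illustration (not part of the diff):

from typing import Optional

class Server:
    # Fine on Python 3.9+: Optional predates PEP 604.
    proc: Optional[int] = None

    # TypeError on 3.9, since class-level annotations are evaluated
    # when the class body runs:
    # proc2: int | None = None

    # A quoted annotation is stored as a string and never evaluated here,
    # which also sidesteps the 3.9 limitation:
    proc3: "Optional[int]" = None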

inference_perf/analysis/analyze.py

Lines changed: 3 additions & 3 deletions

@@ -16,12 +16,12 @@
 import logging
 import operator
 from pathlib import Path
-from typing import Any, Dict, List, Tuple
+from typing import Any, Dict, List, Optional, Tuple

 logger = logging.getLogger(__name__)


-def _extract_latency_metric(latency_data: Dict[str, Any], metric_name: str, convert_to_ms: bool = False) -> float | None:
+def _extract_latency_metric(latency_data: Dict[str, Any], metric_name: str, convert_to_ms: bool = False) -> Optional[float]:
     """Helper to extract a metric's mean value from latency data."""
     metric_data = latency_data.get(metric_name)
     if isinstance(metric_data, dict):
@@ -31,7 +31,7 @@ def _extract_latency_metric(latency_data: Dict[str, Any], metric_name: str, conv
     return None


-def _extract_throughput_metric(throughput_data: Dict[str, Any], metric_name: str) -> float | None:
+def _extract_throughput_metric(throughput_data: Dict[str, Any], metric_name: str) -> Optional[float]:
     """Helper to extract a throughput metric's value."""
     metric_value = throughput_data.get(metric_name)
     if isinstance(metric_value, (int, float)):

inference_perf/client/metricsclient/base.py

Lines changed: 2 additions & 2 deletions

@@ -97,11 +97,11 @@ def __init__(self) -> None:
         pass

     @abstractmethod
-    def collect_metrics_summary(self, runtime_parameters: PerfRuntimeParameters) -> ModelServerMetrics | None:
+    def collect_metrics_summary(self, runtime_parameters: PerfRuntimeParameters) -> Optional[ModelServerMetrics]:
         raise NotImplementedError

     @abstractmethod
-    def collect_metrics_for_stage(self, runtime_parameters: PerfRuntimeParameters, stage_id: int) -> ModelServerMetrics | None:
+    def collect_metrics_for_stage(self, runtime_parameters: PerfRuntimeParameters, stage_id: int) -> Optional[ModelServerMetrics]:
         raise NotImplementedError

     @abstractmethod

inference_perf/client/metricsclient/mock_client.py

Lines changed: 3 additions & 2 deletions

@@ -12,16 +12,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from .base import MetricsClient, PerfRuntimeParameters, ModelServerMetrics
+from typing import Optional


 class MockMetricsClient(MetricsClient):
     def __init__(self) -> None:
         pass

-    def collect_metrics_summary(self, runtime_parameters: PerfRuntimeParameters) -> ModelServerMetrics | None:
+    def collect_metrics_summary(self, runtime_parameters: PerfRuntimeParameters) -> Optional[ModelServerMetrics]:
         return None

-    def collect_metrics_for_stage(self, runtime_parameters: PerfRuntimeParameters, stage_id: int) -> ModelServerMetrics | None:
+    def collect_metrics_for_stage(self, runtime_parameters: PerfRuntimeParameters, stage_id: int) -> Optional[ModelServerMetrics]:
         return None

     def wait(self) -> None:

inference_perf/client/metricsclient/prometheus_client/base.py

Lines changed: 4 additions & 4 deletions

@@ -14,7 +14,7 @@
 from abc import abstractmethod
 import logging
 import time
-from typing import List, cast, Any
+from typing import List, cast, Any, Optional
 import requests
 from inference_perf.client.modelserver.base import ModelServerPrometheusMetric
 from inference_perf.config import PrometheusClientConfig
@@ -184,7 +184,7 @@ def wait(self) -> None:
         wait_time = self.scrape_interval + PROMETHEUS_SCRAPE_BUFFER_SEC
         time.sleep(wait_time)

-    def collect_metrics_summary(self, runtime_parameters: PerfRuntimeParameters) -> ModelServerMetrics | None:
+    def collect_metrics_summary(self, runtime_parameters: PerfRuntimeParameters) -> Optional[ModelServerMetrics]:
         """
         Collects the summary metrics for the given Perf Benchmark run.
@@ -204,7 +204,7 @@ def collect_metrics_summary(self, runtime_parameters: PerfRuntimeParameters) ->

         return self.get_model_server_metrics(runtime_parameters.model_server_metrics, query_duration, query_eval_time)

-    def collect_metrics_for_stage(self, runtime_parameters: PerfRuntimeParameters, stage_id: int) -> ModelServerMetrics | None:
+    def collect_metrics_for_stage(self, runtime_parameters: PerfRuntimeParameters, stage_id: int) -> Optional[ModelServerMetrics]:
         """
         Collects the summary metrics for a specific stage.
@@ -235,7 +235,7 @@ def collect_metrics_for_stage(self, runtime_parameters: PerfRuntimeParameters, s

     def get_model_server_metrics(
         self, metrics_metadata: MetricsMetadata, query_duration: float, query_eval_time: float
-    ) -> ModelServerMetrics | None:
+    ) -> Optional[ModelServerMetrics]:
         """
         Collects the summary metrics for the given Model Server Client and query duration.
inference_perf/client/modelserver/openai_client.py

Lines changed: 1 addition & 1 deletion

@@ -31,7 +31,7 @@


 class openAIModelServerClient(ModelServerClient):
-    _session: "openAIModelServerClientSession | None" = None
+    _session: "Optional[openAIModelServerClientSession]" = None
     _session_lock = asyncio.Lock()

     def __init__(

inference_perf/client/requestdatacollector/multiprocess.py

Lines changed: 2 additions & 2 deletions

@@ -16,7 +16,7 @@

 from asyncio import get_event_loop, create_task
 from contextlib import asynccontextmanager
-from typing import AsyncIterator
+from typing import AsyncIterator, Optional
 from functools import partial
 import logging
 from inference_perf.client.requestdatacollector import RequestDataCollector
@@ -30,7 +30,7 @@ class MultiprocessRequestDataCollector(RequestDataCollector):
     """Responsible for accumulating client request metrics"""

     def __init__(self) -> None:
-        self.queue: "mp.JoinableQueue[RequestLifecycleMetric | None]" = mp.JoinableQueue()
+        self.queue: "mp.JoinableQueue[Optional[RequestLifecycleMetric]]" = mp.JoinableQueue()

     def record_metric(self, metric: RequestLifecycleMetric) -> None:
         self.queue.put(metric)

inference_perf/loadgen/load_generator.py

Lines changed: 17 additions & 3 deletions

@@ -31,15 +31,29 @@
 from asyncio import (
     CancelledError,
     Semaphore,
-    TaskGroup,
     create_task,
     gather,
     run,
     sleep,
     set_event_loop_policy,
     get_event_loop,
 )
-from typing import List, Tuple, TypeAlias, Optional
+
+try:
+    from asyncio import TaskGroup
+except ImportError:
+    # Python 3.9 compatibility: TaskGroup was added in 3.11.
+    # This is a dummy for import-time compatibility.
+    # Runtime usage will still require Python 3.11+.
+    TaskGroup = object
+
+from typing import List, Tuple, Optional, Union
+try:
+    from typing import TypeAlias
+except ImportError:
+    # Python 3.9 compatibility: TypeAlias was added in 3.10
+    from typing import Any
+    TypeAlias = Any
 from types import FrameType
 import time
 import multiprocessing as mp
@@ -55,7 +69,7 @@

 logger = logging.getLogger(__name__)

-RequestQueueData: TypeAlias = Tuple[int, InferenceAPIData | int, float, Optional[str]]
+RequestQueueData: TypeAlias = Tuple[int, Union[InferenceAPIData, int], float, Optional[str]]


 class Worker(mp.Process):
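
For context on the fallback above: on Python 3.11+ asyncio.TaskGroup is used as an async context manager, which is why a bare TaskGroup = object satisfies the import but not runtime use. A minimal sketch of the 3.11+ pattern (illustrative, not code from this diff):

import asyncio

async def main() -> None:
    # TaskGroup awaits all child tasks on exit and cancels the
    # remaining ones if any task raises (Python 3.11+ only).
    async with asyncio.TaskGroup() as tg:
        tg.create_task(asyncio.sleep(0.1))
        tg.create_task(asyncio.sleep(0.2))

asyncio.run(main())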
Lines changed: 90 additions & 0 deletions
@@ -0,0 +1,90 @@
+import unittest
+from unittest.mock import MagicMock
+import multiprocessing as mp
+import asyncio
+import typing
+
+# Patch asyncio.TaskGroup for Python < 3.11
+if not hasattr(asyncio, 'TaskGroup'):
+    class MockTaskGroup:
+        async def __aenter__(self):
+            return self
+        async def __aexit__(self, exc_type, exc_val, exc_tb):
+            pass
+        def create_task(self, coro):
+            return asyncio.create_task(coro)
+    asyncio.TaskGroup = MockTaskGroup
+
+# Patch typing.TypeAlias for Python < 3.10
+if not hasattr(typing, 'TypeAlias'):
+    typing.TypeAlias = typing.Any
+
+from inference_perf.loadgen.load_generator import LoadGenerator
+from inference_perf.config import LoadConfig, LoadType
+
+class MockWorker:
+    def __init__(self, id, shared_max_concurrency):
+        self.id = id
+        self.shared_max_concurrency = shared_max_concurrency
+
+class TestLoadGeneratorConcurrency(unittest.TestCase):
+    def setUp(self):
+        self.mock_datagen = MagicMock()
+        self.load_config = LoadConfig(
+            type=LoadType.CONCURRENT,
+            num_workers=4,
+            worker_max_concurrency=100
+        )
+        # Mock get_circuit_breaker since LoadGenerator.__init__ calls it
+        with unittest.mock.patch('inference_perf.loadgen.load_generator.get_circuit_breaker'):
+            self.load_generator = LoadGenerator(self.mock_datagen, self.load_config)
+
+    def test_set_worker_concurrency_divisible(self):
+        # Set up workers
+        self.load_generator.workers = []
+        for i in range(4):
+            shared_val = mp.Value('i', 0)
+            self.load_generator.workers.append(MockWorker(i, shared_val))
+
+        # concurrency_level = 8 (8 / 4 = 2 per worker)
+        self.load_generator._set_worker_concurrency(8)
+
+        for worker in self.load_generator.workers:
+            self.assertEqual(worker.shared_max_concurrency.value, 2, f"Worker {worker.id} should have concurrency 2")
+
+    def test_set_worker_concurrency_remainder(self):
+        # Set up workers
+        self.load_generator.workers = []
+        for i in range(4):
+            shared_val = mp.Value('i', 0)
+            self.load_generator.workers.append(MockWorker(i, shared_val))
+
+        # concurrency_level = 10 (10 // 4 = 2, remainder 2):
+        # workers 0 and 1 should get 3; workers 2 and 3 should get 2
+        self.load_generator._set_worker_concurrency(10)
+
+        self.assertEqual(self.load_generator.workers[0].shared_max_concurrency.value, 3)
+        self.assertEqual(self.load_generator.workers[1].shared_max_concurrency.value, 3)
+        self.assertEqual(self.load_generator.workers[2].shared_max_concurrency.value, 2)
+        self.assertEqual(self.load_generator.workers[3].shared_max_concurrency.value, 2)
+
+    def test_set_worker_concurrency_less_than_workers(self):
+        # Set up workers
+        self.load_generator.workers = []
+        for i in range(4):
+            shared_val = mp.Value('i', 0)
+            self.load_generator.workers.append(MockWorker(i, shared_val))
+
+        # concurrency_level = 3:
+        # workers 0, 1, and 2 should get 1; worker 3 should get 0
+        self.load_generator._set_worker_concurrency(3)
+
+        self.assertEqual(self.load_generator.workers[0].shared_max_concurrency.value, 1)
+        self.assertEqual(self.load_generator.workers[1].shared_max_concurrency.value, 1)
+        self.assertEqual(self.load_generator.workers[2].shared_max_concurrency.value, 1)
+        self.assertEqual(self.load_generator.workers[3].shared_max_concurrency.value, 0)
+
+if __name__ == '__main__':
+    unittest.main()
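
Since the new file ends with a standard unittest.main() hook, the suite can be run directly or picked up by discovery; the paths below are hypothetical, as the diff header does not show the new file's name:

# Run the file directly (path is hypothetical):
python tests/test_load_generator_concurrency.py

# Or let unittest discover it:
python -m unittest discover -s tests -p "test_*.py" -v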
