Skip to content

Commit c2776b1

Browse files
ajcasagrande and nv-nmailhot
authored and committed
fix: cap the default number of max workers to 32
- changes the formula for record-processors to be 1 for every 4 workers if not specified
- changes formula for default workers to min(concurrency, (cpus * 0.75) - 1)
- caps the max workers to 32 regardless of cpu count unless the user specifies a --workers-max
1 parent 6a35b52 commit c2776b1

File tree

6 files changed

+89
-30
lines changed

6 files changed

+89
-30
lines changed

aiperf/common/config/worker_config.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from aiperf.common.config.cli_parameter import CLIParameter, DisableCLI
99
from aiperf.common.config.config_defaults import WorkersDefaults
1010
from aiperf.common.config.groups import Groups
11+
from aiperf.common.constants import DEFAULT_MAX_WORKERS_CAP
1112

1213

1314
class WorkersConfig(BaseConfig):
@@ -27,7 +28,9 @@ class WorkersConfig(BaseConfig):
2728
int | None,
2829
Field(
2930
description="Maximum number of workers to create. If not specified, the number of"
30-
" workers will be determined by the smaller of (concurrency + 1) and (num CPUs - 1).",
31+
" workers will be determined by the formula `min(concurrency, (num CPUs * 0.75) - 1)`, "
32+
f" with a default max cap of `{DEFAULT_MAX_WORKERS_CAP}`. Any value provided will still be capped by"
33+
f" the concurrency value (if specified), but not by the max cap.",
3134
),
3235
CLIParameter(
3336
name=("--workers-max", "--max-workers"),

aiperf/common/constants.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,5 +91,13 @@
9191
DEFAULT_WORKER_HEALTH_CHECK_INTERVAL = 2.0
9292
"""Default interval in seconds between worker health check messages."""
9393

94+
DEFAULT_RECORD_PROCESSOR_SCALE_FACTOR = 4
95+
"""Default scale factor for the number of record processors to spawn based on the number of workers.
96+
This will spawn 1 record processor for every X workers."""
97+
98+
DEFAULT_MAX_WORKERS_CAP = 32
99+
"""Default absolute maximum number of workers to spawn, regardless of the number of CPU cores.
100+
Only applies if the user does not specify a max workers value."""
101+
94102
DEFAULT_ZMQ_CONTEXT_TERM_TIMEOUT = 10.0
95103
"""Default timeout for terminating the ZMQ context in seconds."""

aiperf/controller/system_controller.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
AIPERF_DEV_MODE,
1616
DEFAULT_PROFILE_CONFIGURE_TIMEOUT,
1717
DEFAULT_PROFILE_START_TIMEOUT,
18+
DEFAULT_RECORD_PROCESSOR_SCALE_FACTOR,
1819
)
1920
from aiperf.common.enums import (
2021
CommandResponseStatus,
@@ -326,7 +327,8 @@ async def _handle_spawn_workers_command(self, message: SpawnWorkersCommand) -> N
326327
# If we are scaling the record processor service count with the number of workers, spawn the record processors
327328
if self.scale_record_processors_with_workers:
328329
await self.service_manager.run_service(
329-
ServiceType.RECORD_PROCESSOR, max(1, message.num_workers // 2)
330+
ServiceType.RECORD_PROCESSOR,
331+
max(1, message.num_workers // DEFAULT_RECORD_PROCESSOR_SCALE_FACTOR),
330332
)
331333

332334
@on_command(CommandType.SHUTDOWN_WORKERS)

aiperf/workers/worker_manager.py

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from aiperf.common.bootstrap import bootstrap_and_run_service
1010
from aiperf.common.config import ServiceConfig, UserConfig
1111
from aiperf.common.constants import (
12+
DEFAULT_MAX_WORKERS_CAP,
1213
DEFAULT_WORKER_CHECK_INTERVAL,
1314
DEFAULT_WORKER_ERROR_RECOVERY_TIME,
1415
DEFAULT_WORKER_HIGH_LOAD_CPU_USAGE,
@@ -75,20 +76,25 @@ def __init__(
7576
self.max_concurrency = self.user_config.loadgen.concurrency
7677
self.max_workers = self.service_config.workers.max
7778
if self.max_workers is None:
78-
# Default to the number of CPU cores - 1
79-
self.max_workers = self.cpu_count - 1
79+
# Default to 75% of the CPU cores - 1, with a cap of DEFAULT_MAX_WORKERS_CAP, and a minimum of 1
80+
self.max_workers = max(
81+
1, min(int(self.cpu_count * 0.75) - 1, DEFAULT_MAX_WORKERS_CAP)
82+
)
83+
self.debug(
84+
lambda: f"Auto-setting max workers to {self.max_workers} due to no max workers specified."
85+
)
8086

8187
# Cap the worker count to the max concurrency, but only if the user is in concurrency mode.
82-
if self.max_concurrency:
83-
self.max_workers = min(
84-
self.max_concurrency,
85-
self.max_workers,
88+
if self.max_concurrency and self.max_concurrency < self.max_workers:
89+
self.max_workers = self.max_concurrency
90+
self.debug(
91+
lambda: f"Capping max workers to {self.max_workers} due to concurrency."
8692
)
8793

8894
# Ensure we have at least the min workers
8995
self.max_workers = max(
9096
self.max_workers,
91-
self.service_config.workers.min or 0,
97+
self.service_config.workers.min or 1,
9298
)
9399
self.initial_workers = self.max_workers
94100

docs/cli_options.md

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -157,8 +157,9 @@ The following options are available when profiling using AIPerf.
157157
╰───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
158158
```
159159
```
160-
╭─ Workers ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
161-
│ WORKERS-MAX --workers-max --max-workers Maximum number of workers to create. If not specified, the number of workers will be determined by the smaller of (concurrency + 1) and (num │
162-
│ CPUs - 1). │
163-
╰───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
164-
```
160+
╭─ Workers ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
161+
│ WORKERS-MAX --workers-max --max-workers Maximum number of workers to create. If not specified, the number of workers will be determined by the formula │
162+
│ min(concurrency, (num CPUs * 0.75) - 1), with a default max cap of 32. Any value provided will still be capped by the │
163+
│ concurrency value (if specified), but not by the max cap. │
164+
╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
165+
```

tests/workers/test_worker_manager.py

Lines changed: 55 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -18,30 +18,69 @@ class TestMaxWorkers:
1818
"""Test the max workers calculation logic in WorkerManager."""
1919

2020
@pytest.mark.parametrize(
21-
"concurrency,request_rate,max_workers,expected",
21+
"cpus,concurrency,max_workers,expected",
2222
[
23-
(None, 1000, None, 9), # Default case (10 fake CPUs - 1)
24-
(None, 1000, 4, 4), # Only max set
25-
(None, None, None, 1), # Concurrency defaults to 1
26-
(3, None, None, 3), # Only concurrency set
27-
(2, None, 5, 2), # Concurrency limits max
28-
(8, None, 3, 3), # Max limits concurrency
29-
(10, 1000, 5, 5), # Normal case with all values
23+
(10, 100, None, 6), # CPU-based limit: 10 * 0.75 - 1 = 6
24+
(10, 100, 4, 4), # max_workers setting limits to 4
25+
(10, None, None, 1), # Concurrency defaults to 1, which limits workers to 1
26+
(10,3,None,3), # Low concurrency (3) limits workers below CPU calculation
27+
(10, 8, 3, 3), # max_workers (3) overrides higher concurrency (8)
28+
(10, 10, 5, 5), # max_workers (5) overrides higher concurrency (10)
29+
(224, 1000, None, 32), # High CPU count with hard cap at 32 workers
30+
(32, 1000, None, 23), # CPU-based limit: 32 * 0.75 - 1 = 23
31+
(1, 100, None, 1), # Single CPU system, should default to 1 worker minimum
32+
(2, 100, None, 1), # Two CPUs: 2 * 0.75 - 1 = 0.5, rounded up to 1
33+
(4, 100, None, 2), # Four CPUs: 4 * 0.75 - 1 = 2
34+
(44,1000,None,32), # CPU count that would exceed 32 limit: 44 * 0.75 - 1 = 32
35+
(45,1000,None,32), # CPU count that hits the hard cap: 45 * 0.75 - 1 = 32.75
36+
(4, 100, 100, 100), # Very high max_workers, not limited by CPU calculation
37+
(64, 1, None, 1), # Concurrency of 1 limits to 1 worker regardless of CPUs
3038
],
31-
)
32-
def test_max_workers_combinations(
33-
self, concurrency, request_rate, max_workers, expected
39+
) # fmt: skip
40+
def test_max_workers_combinations(self, cpus, concurrency, max_workers, expected):
41+
"""Test max workers calculation with different CPU counts, concurrency, and max_workers settings."""
42+
with patch(
43+
"aiperf.workers.worker_manager.multiprocessing.cpu_count", return_value=cpus
44+
):
45+
service_config = ServiceConfig(workers=WorkersConfig(max=max_workers))
46+
user_config = UserConfig(
47+
endpoint=EndpointConfig(model_names=["test-model"]),
48+
loadgen=LoadGeneratorConfig(concurrency=concurrency),
49+
)
50+
51+
worker_manager = WorkerManager(
52+
service_config=service_config,
53+
user_config=user_config,
54+
service_id="test-worker-manager",
55+
)
56+
57+
assert worker_manager.max_workers == expected
58+
59+
@pytest.mark.parametrize(
60+
"cpus,request_rate,max_workers,expected",
61+
[
62+
(10, 50, None, 6), # CPU-based limit: 10 * 0.75 - 1 = 6 (no concurrency limit)
63+
(10, 100, 4, 4), # max_workers setting limits to 4
64+
(4, 10, None, 2), # Low CPU count: 4 * 0.75 - 1 = 2
65+
(2, 50, None, 1), # Very low CPU: 2 * 0.75 - 1 = 0.5, rounded up to 1
66+
(1, 100, None, 1), # Single CPU system minimum
67+
(64, 500, None, 32), # High CPU count with hard cap at 32 workers
68+
(10, 1, None, 6), # Very low request rate still uses CPU calculation
69+
(10, 1000, 8, 8), # High request rate with max_workers override
70+
(8, 50, 20, 20), # max_workers higher than CPU calc
71+
],
72+
) # fmt: skip
73+
def test_max_workers_with_request_rate_combinations(
74+
self, cpus, request_rate, max_workers, expected
3475
):
35-
"""Test various combinations of configuration values."""
76+
"""Test max workers calculation with request_rate mode where concurrency is 0/None."""
3677
with patch(
37-
"aiperf.workers.worker_manager.multiprocessing.cpu_count", return_value=10
78+
"aiperf.workers.worker_manager.multiprocessing.cpu_count", return_value=cpus
3879
):
3980
service_config = ServiceConfig(workers=WorkersConfig(max=max_workers))
4081
user_config = UserConfig(
4182
endpoint=EndpointConfig(model_names=["test-model"]),
42-
loadgen=LoadGeneratorConfig(
43-
concurrency=concurrency, request_rate=request_rate
44-
),
83+
loadgen=LoadGeneratorConfig(request_rate=request_rate),
4584
)
4685

4786
worker_manager = WorkerManager(

0 commit comments

Comments (0)