Skip to content

Commit 3faa7eb

Browse files
andreas-abel authored and copybara-github committed
Add support for multi-threaded benchmarks to the parallel runner
PiperOrigin-RevId: 794119102 Change-Id: I07a1aefdfcfbdf9767452822c399f7fc8bf2c607
1 parent b8c07c7 commit 3faa7eb

File tree

5 files changed

+215
-21
lines changed

5 files changed

+215
-21
lines changed

fleetbench/parallel/parallel_bench.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,14 @@
130130
" default weights.",
131131
)
132132

133+
_CUSTOM_BENCHMARK_THREADS = flags.DEFINE_multi_string(
134+
"benchmark_threads",
135+
[],
136+
"Number of threads to use for selected benchmarks. The input should be in"
137+
" the format of <benchmark_name|benchmark_filter>:<n_threads>. Benchmarks"
138+
" for which no thread count is specified will use one thread by default.",
139+
)
140+
133141
_NUM_CPUS = flags.DEFINE_integer(
134142
"num_cpus",
135143
len(cpu.Available()),
@@ -169,6 +177,35 @@
169177
)
170178

171179

180+
def _ParseBenchmarkThreads(
181+
benchmark_threads_list: list[str],
182+
) -> dict[str, int]:
183+
"""Parses a list of benchmark thread count specs into a dictionary.
184+
185+
The string element in the list should be in the format:
186+
<benchmark_name|benchmark_filter>:<n_threads>.
187+
188+
Args:
189+
benchmark_threads_list: A list of strings to parse.
190+
191+
Returns:
192+
A dictionary of {<benchmark_name|benchmark_filter>: <n_threads>}.
193+
"""
194+
benchmark_threads = {}
195+
for spec in benchmark_threads_list:
196+
try:
197+
benchmark, n_threads = spec.rsplit(":", maxsplit=1)
198+
benchmark_threads[benchmark] = int(n_threads)
199+
except ValueError:
200+
logging.warning(
201+
"Invalid benchmark string: %s. The format should be"
202+
" <benchmark_name|benchmark_filter>:<n_threads>. Skipping...",
203+
spec,
204+
)
205+
206+
return benchmark_threads
207+
208+
172209
def main(argv: Sequence[str]) -> None:
173210
if len(argv) > 1:
174211
raise app.UsageError("Too many command-line arguments.")
@@ -201,6 +238,7 @@ def main(argv: Sequence[str]) -> None:
201238
temp_parent_root=_TEMP_ROOT.value,
202239
keep_raw_data=_KEEP_RAW_DATA.value,
203240
benchmark_perf_counters=_BENCHMARK_PERF_COUNTERS.value,
241+
benchmark_threads=_ParseBenchmarkThreads(_CUSTOM_BENCHMARK_THREADS.value),
204242
)
205243

206244
bench.SetWeights(

fleetbench/parallel/parallel_bench_lib.py

Lines changed: 36 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -14,14 +14,14 @@
1414

1515
"""Run Fleetbench benchmarks in parallel."""
1616

17+
import collections
1718
import dataclasses
1819
import json
1920
import math
2021
import os
2122
import shutil
2223
import threading
2324
import time
24-
from typing import Any
2525

2626
from absl import logging
2727
import numpy as np
@@ -74,6 +74,7 @@ class ParallelBench:
7474
cpu_affinity: Whether to bind each worker to a CPU or allow the scheduler to
7575
move them around.
7676
benchmark_weights: Whether to use adaptive benchmark selection.
77+
benchmark_threads: Number of threads to use for selected benchmarks.
7778
benchmarks: List of benchmarks to run.
7879
target_utilization: Target utilization from 0 to 1.
7980
duration: How long in seconds to run for.
@@ -107,6 +108,7 @@ def __init__(
107108
temp_parent_root: str,
108109
keep_raw_data: bool,
109110
benchmark_perf_counters: str,
111+
benchmark_threads: dict[str, int],
110112
):
111113
"""Initialize the parallel benchmark runner."""
112114

@@ -118,6 +120,7 @@ def __init__(
118120
self.cpus = cpus[1:]
119121
self.cpu_affinity = cpu_affinity
120122
self.benchmark_weights: dict[str, float] = {}
123+
self.benchmark_threads = benchmark_threads
121124
self.benchmarks: dict[str, bm.Benchmark] = {}
122125
self.target_utilization = utilization * 100
123126
self.duration = duration
@@ -350,8 +353,8 @@ def _RunSchedulingLoop(self) -> None:
350353

351354
self.utilization_samples.append((pd.Timestamp.now(), total_utilization))
352355

353-
least_busy_cpus = sorted(
354-
utilization_per_cpu.keys(), key=utilization_per_cpu.get
356+
least_busy_cpus = collections.OrderedDict(
357+
sorted(utilization_per_cpu.items(), key=lambda item: item[1])
355358
)
356359

357360
# E.g. we are at 50% utilization, target is 70%.
@@ -374,14 +377,40 @@ def _RunSchedulingLoop(self) -> None:
374377
benchmark=benchmark,
375378
out_file=path,
376379
)
377-
for cpu_id in least_busy_cpus:
378-
if self.workers[cpu_id].TryAddRun(r):
380+
required_n_threads = self.benchmark_threads.get(
381+
benchmark.BenchmarkName(), 1
382+
)
383+
# If required_n_threads > 1, we will try to reserve required_n_threads-1
384+
# additional workers, whose CPUs we can temporarily assign to the main
385+
# worker for the benchmark.
386+
extra_workers = []
387+
for cpu_id in list(least_busy_cpus.keys()):
388+
if required_n_threads > 1:
389+
if self.workers[cpu_id].TryBlock():
390+
logging.debug("Reserving CPU %d for %s", cpu_id, benchmark)
391+
extra_workers.append(self.workers[cpu_id])
392+
del least_busy_cpus[cpu_id]
393+
required_n_threads -= 1
394+
elif self.workers[cpu_id].TryAddRun(r, extra_workers):
379395
next_run_id += 1
380-
logging.debug("Scheduling %s on CPU %d", benchmark, cpu_id)
396+
if extra_workers:
397+
logging.debug(
398+
"Scheduling %s on CPUs %s",
399+
benchmark,
400+
[cpu_id]
401+
+ [extra_worker.cpu for extra_worker in extra_workers],
402+
)
403+
else:
404+
logging.debug("Scheduling %s on CPU %d", benchmark, cpu_id)
381405
# We just added something to this worker, so presumably
382406
# trying to add the next benchmark to it will fail.
383-
least_busy_cpus.remove(cpu_id)
407+
del least_busy_cpus[cpu_id]
384408
break
409+
else:
410+
# We did not find enough CPUs for the benchmark. If we blocked any
411+
# extra workers, we will unblock them here.
412+
for w in extra_workers:
413+
w.Unblock()
385414

386415
# Process any available results. This updates the runtimes of each
387416
# benchmark to make the scheduling probabilities more accurate.

fleetbench/parallel/parallel_bench_lib_test.py

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ def setUp(self):
4444
temp_parent_root=absltest.get_default_test_tmpdir(),
4545
keep_raw_data=True,
4646
benchmark_perf_counters="",
47+
benchmark_threads={},
4748
)
4849

4950
def tearDown(self):
@@ -104,6 +105,7 @@ def fake_utilization(unused_cpus):
104105
temp_parent_root=absltest.get_default_test_tmpdir(),
105106
keep_raw_data=True,
106107
benchmark_perf_counters="",
108+
benchmark_threads={},
107109
)
108110
self.pb.SetWeights(
109111
benchmark_target="fake_bench",
@@ -188,6 +190,68 @@ def test_set_extra_benchmark_flags(self):
188190
],
189191
)
190192

193+
@mock.patch.object(bm, "GetSubBenchmarks", autospec=True)
194+
@mock.patch.object(run.Run, "Execute", autospec=True)
195+
@mock.patch.object(cpu, "Utilization", autospec=True)
196+
@mock.patch.object(reporter, "GenerateBenchmarkReport", autospec=True)
197+
@mock.patch.object(
198+
reporter, "SaveBenchmarkResults", autospec=True, return_value=(None, None)
199+
)
200+
@flagsaver.flagsaver(
201+
benchmark_dir=absltest.get_default_test_tmpdir(),
202+
)
203+
def testRunThreads(
204+
self,
205+
mock_save_benchmark_results,
206+
mock_generate_benchmark_report,
207+
mock_utilization,
208+
mock_execute,
209+
mock_get_subbenchmarks,
210+
):
211+
mock_get_subbenchmarks.return_value = ["BM_Test1"]
212+
mock_execute.return_value = result.Result(
213+
benchmark="fake_bench (BM_Test1)",
214+
rc=0,
215+
stdout="fake_stdout",
216+
stderr="fake_stderr",
217+
duration=0.01,
218+
bm_cpu_time=0.01,
219+
result="fake_result",
220+
)
221+
self.create_tempfile(
222+
os.path.join(absltest.get_default_test_tmpdir(), "fake_bench")
223+
)
224+
225+
def fake_utilization(unused_cpus):
226+
# Return 0% for the first call, then 55% for the rest.
227+
fake_utilizations = [(0, {1: 0, 2: 0}, 0), (55, {1: 55, 2: 55}, 1)]
228+
return fake_utilizations[min(mock_utilization.call_count - 1, 1)]
229+
230+
mock_utilization.side_effect = fake_utilization
231+
232+
mock_generate_benchmark_report.return_value = pd.DataFrame()
233+
234+
self.pb = parallel_bench_lib.ParallelBench(
235+
cpus=[0, 1, 2],
236+
cpu_affinity=False,
237+
utilization=0.5,
238+
duration=0.1,
239+
repetitions=1,
240+
temp_parent_root=absltest.get_default_test_tmpdir(),
241+
keep_raw_data=True,
242+
benchmark_perf_counters="",
243+
benchmark_threads={"BM_Test1": 2},
244+
)
245+
self.pb.SetWeights(
246+
benchmark_target="fake_bench",
247+
benchmark_filter=None,
248+
workload_filter=None,
249+
scheduling_strategy=weights.SchedulingStrategy.BM_WEIGHTED,
250+
custom_benchmark_weights=None,
251+
)
252+
self.pb.Run()
253+
mock_execute.assert_called_once()
254+
191255
def test_convert_to_dataframe(self):
192256
# First entries are fake durations, the second entries are real durations.
193257
self.pb.runtimes["BM_Test1"] = [

fleetbench/parallel/worker.py

Lines changed: 58 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -14,17 +14,26 @@
1414

1515
"""Worker for parallel benchmark."""
1616

17+
import dataclasses
1718
import os
1819
import queue
1920
import threading
20-
from typing import Optional
21+
from typing import Optional, Self
2122

2223
from absl import logging
2324

2425
from fleetbench.parallel import result
2526
from fleetbench.parallel import run as parallel_run
2627

2728

29+
@dataclasses.dataclass
30+
class RunAndExtraWorkers:
31+
"""Groups a benchmark run with any additional workers it needs."""
32+
33+
run: parallel_run.Run
34+
extra_workers: list["Worker"]
35+
36+
2837
class Worker(threading.Thread):
2938
"""Per-CPU worker for running benchmarks.
3039
@@ -34,32 +43,57 @@ class Worker(threading.Thread):
3443
result_q: Queue of results from completed runs.
3544
cpu: The CPU number this worker is assigned to.
3645
affinity: If true, bind this thread to the assigned CPU.
46+
lock: Lock to temporarily block this worker while its CPU is being used by
47+
another worker that requires more than one CPU.
48+
in_use_as_extra_worker: Whether this worker is currently being used as an
49+
extra worker.
3750
"""
3851

3952
def __init__(self, cpu: int, affinity: bool):
4053
super().__init__()
41-
self._command_q: queue.Queue[Optional[parallel_run.Run]] = queue.Queue(
54+
self._command_q: queue.Queue[Optional[RunAndExtraWorkers]] = queue.Queue(
4255
maxsize=1
4356
)
4457
self._result_q: queue.Queue[Optional[result.Result]] = queue.Queue()
45-
self._cpu = cpu
58+
self.cpu = cpu
4659
self._affinity = affinity
60+
self._lock = threading.Lock()
61+
self._in_use_as_extra_worker = False
4762

48-
def TryAddRun(self, run: parallel_run.Run) -> bool:
63+
def TryAddRun(self, run: parallel_run.Run, extra_workers: list[Self]) -> bool:
4964
"""Tries to add a run to the worker's queue.
5065
5166
Args:
5267
run: The run to add to the queue.
68+
extra_workers: The extra workers to add to the queue.
5369
5470
Returns:
5571
True if successful, False if the queue is full.
5672
"""
73+
if self._in_use_as_extra_worker:
74+
return False
5775
try:
58-
self._command_q.put_nowait(run)
76+
self._command_q.put_nowait(RunAndExtraWorkers(run, extra_workers))
5977
return True
6078
except queue.Full:
6179
return False
6280

81+
def TryBlock(self) -> bool:
82+
"""Tries to block the worker so that its CPU can be used by another worker.
83+
84+
Returns:
85+
True if successful, False if the worker is busy.
86+
"""
87+
lock_acquired = self._lock.acquire(blocking=False)
88+
if lock_acquired:
89+
self._in_use_as_extra_worker = True
90+
return lock_acquired
91+
92+
def Unblock(self) -> None:
93+
"""Resume the worker's normal operation."""
94+
self._in_use_as_extra_worker = False
95+
self._lock.release()
96+
6397
def StopAndGetResults(self) -> list[result.Result]:
6498
"""Shut down the worker loop, then wait for results."""
6599
self._command_q.put(None)
@@ -87,15 +121,26 @@ def TryGetResults(self) -> list[result.Result]:
87121
def run(self):
88122
"""Called when the thread is started. Loops executing commands."""
89123
if self._affinity:
90-
os.sched_setaffinity(threading.get_native_id(), [self._cpu])
91-
logging.debug("Worker %d running", self._cpu)
124+
os.sched_setaffinity(threading.get_native_id(), [self.cpu])
125+
logging.debug("Worker %d running", self.cpu)
92126

93127
while True:
94-
run_object = self._command_q.get()
95-
if run_object is None:
96-
logging.debug("Worker %d shutting down", self._cpu)
128+
task = self._command_q.get()
129+
if task is None:
130+
logging.debug("Worker %d shutting down", self.cpu)
97131
self._result_q.put(None)
98132
break
99-
self._result_q.put(run_object.Execute())
100-
101-
logging.debug("Worker %d exiting", self._cpu)
133+
with self._lock:
134+
extra_workers = task.extra_workers
135+
if extra_workers and self._affinity:
136+
os.sched_setaffinity(
137+
threading.get_native_id(),
138+
[self.cpu] + [extra_worker.cpu for extra_worker in extra_workers],
139+
)
140+
self._result_q.put(task.run.Execute())
141+
if extra_workers and self._affinity:
142+
os.sched_setaffinity(threading.get_native_id(), [self.cpu])
143+
for extra_worker in extra_workers:
144+
extra_worker.Unblock()
145+
146+
logging.debug("Worker %d exiting", self.cpu)

0 commit comments

Comments
 (0)