
Commit 0205245

[warmup] Change default warmup and rep time to adaptive (#1040)
1 parent ec05cc8 commit 0205245

7 files changed

Lines changed: 113 additions & 54 deletions

File tree

tritonbench/components/do_bench/run.py
tritonbench/components/do_bench/utils.py
tritonbench/components/kineto/trace.py
tritonbench/components/ncu/__init__.py
tritonbench/utils/constants.py
tritonbench/utils/parser.py
tritonbench/utils/triton_op.py

tritonbench/components/do_bench/run.py

Lines changed: 19 additions & 12 deletions
@@ -14,6 +14,7 @@
 from .common import summarize_statistics
 from .gpu_events import do_bench_events
 from .power import do_bench_power
+from .utils import estimate_cuda_runtime_ms, resolve_warmup_and_rep

 NS_TO_MS = 1e-6
 logger = logging.getLogger(__name__)
@@ -219,6 +220,7 @@ def _do_bench_cudagraph_with_cache_clear(
         end_event.record()
         torch.cuda.synchronize()
         estimate_ms = start_event.elapsed_time(end_event) / 5
+        _, rep = resolve_warmup_and_rep(None, rep, estimate_ms)

         n_repeat = 1000 if estimate_ms == 0 else max(1, int(rep / estimate_ms))
@@ -301,13 +303,11 @@ def _do_bench_profiler(
         else None
     )

-    # First, estimate the runtime to calculate iterations
-    estimate_ms = triton.testing.do_bench(
+    clear_cache_fn = cache.zero_ if not skip_cache_clearing else lambda *args: None
+    estimate_ms = estimate_cuda_runtime_ms(
         fn,
-        warmup=warmup,
-        rep=rep,
         grad_to_none=grad_to_none,
-        return_mode="mean",
+        clear_cache_fn=clear_cache_fn,
     )

     # Calculate number of iterations based on target rep time
@@ -316,8 +316,6 @@ def _do_bench_profiler(
     else:
         n_repeat = max(1, int(rep / estimate_ms))

-    clear_cache_fn = cache.zero_ if not skip_cache_clearing else lambda *args: None
-
     # Helper function to execute one iteration
     def run_iteration():
         if grad_to_none is not None:
@@ -432,7 +430,7 @@ def _trace_handler(prof: torch.profiler.profile) -> None:


 def _do_bench_cpu(
-    fn, warmup, rep=20, grad_to_none=None, quantiles=None, return_mode="mean"
+    fn, warmup, rep, grad_to_none=None, quantiles=None, return_mode="mean"
 ):
     """Measure latency of a function on CPU."""
     assert return_mode in ["min", "max", "mean", "median", "all"]
@@ -474,8 +472,8 @@ def _do_bench_cpu(

 def _do_bench_entropy(
     fn,
-    warmup=25,
-    rep=100,
+    warmup,
+    rep,
     grad_to_none=None,
     quantiles=None,
     return_mode="mean",
@@ -528,6 +526,7 @@ def _do_bench_entropy(
     precision_increase = False

     cache = triton.runtime.driver.active.get_empty_cache_for_benchmark()
+    clear_cache_fn = lambda: triton.runtime.driver.active.clear_cache(cache)

     # Adaptive warmup loop with batched synchronization
     while True:
@@ -545,7 +544,7 @@ def _do_bench_entropy(
             if grad_to_none is not None:
                 for x in grad_to_none:
                     x.grad = None
-            triton.runtime.driver.active.clear_cache(cache)
+            clear_cache_fn()
             batch_start_events[i].record()
             fn()
             batch_end_events[i].record()
@@ -619,7 +618,7 @@ def _do_bench_entropy(
         if grad_to_none is not None:
             for x in grad_to_none:
                 x.grad = None
-        triton.runtime.driver.active.clear_cache(cache)
+        clear_cache_fn()
         start_events[i].record()
         fn()
         end_events[i].record()
@@ -661,6 +660,14 @@ def do_bench_wrapper(
         entropy_window_size: Size of rolling window for entropy tracking
         entropy_max_samples: Maximum samples before stopping warmup (safety limit)
     """
+    if (warmup is None or rep is None) and not repcnt:
+        estimate_runtime = estimate_cuda_runtime_ms(fn, grad_to_none=grad_to_none)
+        warmup, rep = resolve_warmup_and_rep(
+            warmup,
+            rep,
+            estimate_runtime,
+        )
+
     try:
         if device == "cpu":
             return Latency(

tritonbench/components/do_bench/utils.py

Lines changed: 49 additions & 0 deletions
@@ -0,0 +1,49 @@
+from typing import Callable, Iterable, Optional, Tuple
+
+import torch
+from tritonbench.utils.constants import DEFAULT_WARMUP_REP_BY_ESTIMATED_KERNEL_MS
+
+
+def resolve_warmup_and_rep(
+    warmup: Optional[int], rep: Optional[int], estimate_ms: float
+) -> Tuple[int, int]:
+    if estimate_ms <= 1:
+        default_warmup, default_rep = DEFAULT_WARMUP_REP_BY_ESTIMATED_KERNEL_MS["1"]
+    elif estimate_ms <= 10:
+        default_warmup, default_rep = DEFAULT_WARMUP_REP_BY_ESTIMATED_KERNEL_MS["10"]
+    else:
+        default_warmup, default_rep = DEFAULT_WARMUP_REP_BY_ESTIMATED_KERNEL_MS["100"]
+    return (
+        default_warmup if warmup is None else warmup,
+        default_rep if rep is None else rep,
+    )
+
+
+def estimate_cuda_runtime_ms(
+    fn: Callable,
+    grad_to_none: Optional[Iterable[torch.Tensor]] = None,
+    clear_cache_fn: Optional[Callable[[], None]] = None,
+    iters: int = 5,
+    prime: bool = True,
+) -> float:
+    clear_cache_fn = clear_cache_fn or (lambda: None)
+
+    def run_once() -> None:
+        if grad_to_none is not None:
+            for x in grad_to_none:
+                x.grad = None
+        clear_cache_fn()
+        fn()
+
+    if prime:
+        run_once()
+    torch.cuda.synchronize()
+
+    start_event = torch.cuda.Event(enable_timing=True)
+    end_event = torch.cuda.Event(enable_timing=True)
+    start_event.record()
+    for _ in range(iters):
+        run_once()
+    end_event.record()
+    torch.cuda.synchronize()
+    return start_event.elapsed_time(end_event) / iters
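
For orientation, a minimal sketch of how the two new helpers compose; the matmul workload and tensor sizes below are illustrative stand-ins, not code from this commit:

import torch

from tritonbench.components.do_bench.utils import (
    estimate_cuda_runtime_ms,
    resolve_warmup_and_rep,
)

a = torch.randn(4096, 4096, device="cuda")
b = torch.randn(4096, 4096, device="cuda")
fn = lambda: torch.mm(a, b)

# Average a few timed calls to get an estimated per-call latency in ms.
estimate_ms = estimate_cuda_runtime_ms(fn)

# None means "pick a default for me"; explicit values pass through unchanged.
warmup_ms, rep_ms = resolve_warmup_and_rep(None, None, estimate_ms)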

tritonbench/components/kineto/trace.py

Lines changed: 8 additions & 13 deletions
@@ -8,6 +8,10 @@

 import torch
 import torch.profiler as profiler
+from tritonbench.components.do_bench.utils import (
+    estimate_cuda_runtime_ms,
+    resolve_warmup_and_rep,
+)
 from tritonbench.utils.constants import DEFAULT_N_REP, DEFAULT_N_WARMUP
 from tritonbench.utils.env_utils import has_manifold

@@ -122,8 +126,8 @@ def do_bench_kineto_cudagraph(

 def do_bench_kineto(
     fn: Callable,
-    warmup: int,
-    rep: int,
+    warmup: Optional[int],
+    rep: Optional[int],
     grad_to_none=None,
     fast_flush=True,
     profile_opts=None,
@@ -154,7 +158,6 @@ do_bench_kineto(

     fn()
     torch.cuda.synchronize()
-
     # We maintain a buffer of 256 MB that we clear
     # before each kernel call to make sure that the L2
     # doesn't contain any input data before the run
@@ -167,16 +170,8 @@ do_bench_kineto(
     else:
         clear_cache = lambda *args: None

-    # Estimate the runtime of the function
-    start_event = torch.cuda.Event(enable_timing=True)
-    end_event = torch.cuda.Event(enable_timing=True)
-    start_event.record()
-    for _ in range(5):
-        clear_cache()
-        fn()
-    end_event.record()
-    torch.cuda.synchronize()
-    estimate_ms = start_event.elapsed_time(end_event) / 5
+    estimate_ms = estimate_cuda_runtime_ms(fn, clear_cache_fn=clear_cache)
+    warmup, rep = resolve_warmup_and_rep(warmup, rep, estimate_ms)

     # Calculate number of iterations based on target rep time
     if estimate_ms == 0:

tritonbench/components/ncu/__init__.py

Lines changed: 8 additions & 12 deletions
@@ -1,6 +1,10 @@
 from typing import Callable

 import torch
+from tritonbench.components.do_bench.utils import (
+    estimate_cuda_runtime_ms,
+    resolve_warmup_and_rep,
+)


 class cuda_profiler_range:
@@ -40,20 +44,12 @@ def do_bench_in_task(

     cache = torch.empty(int(256e6 // 4), dtype=torch.int, device="cuda")

-    if warmup:
-        # Estimate the runtime of the function
-        start_event = torch.cuda.Event(enable_timing=True)
-        end_event = torch.cuda.Event(enable_timing=True)
-        start_event.record()
-        for _ in range(5):
-            cache.zero_()
-            fn()
-        end_event.record()
-        torch.cuda.synchronize()
-        estimate_ms = start_event.elapsed_time(end_event) / 5
+    if warmup == True:
+        estimate_ms = estimate_cuda_runtime_ms(fn, clear_cache_fn=cache.zero_)
+        warmup, _ = resolve_warmup_and_rep(warmup, None, estimate_ms)

         # compute number of warmup and repeat
-        n_warmup = max(1, int(warmup / estimate_ms))
+        n_warmup = 1 if estimate_ms == 0 else max(1, int(warmup / estimate_ms))
         # Warm-up
         for _ in range(n_warmup):
             fn()

tritonbench/utils/constants.py

Lines changed: 7 additions & 2 deletions
@@ -1,5 +1,10 @@
-DEFAULT_WARMUP = 3000
-DEFAULT_REP = 3000
+from typing import Dict, Tuple
+
+DEFAULT_WARMUP_REP_BY_ESTIMATED_KERNEL_MS: Dict[str, Tuple[int, int]] = {
+    "1": (100, 100),
+    "10": (1000, 1000),
+    "100": (3000, 3000),
+}
 DEFAULT_POWER_REPCNT = 2000
 DEFAULT_QUANTILES = [0.5, 0.1, 0.9]
 DEFAULT_SLEEP = 0.0
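
Read alongside resolve_warmup_and_rep above, the string keys act as upper bounds on the estimated per-call kernel latency in milliseconds. A worked lookup, shown only for illustration and not part of the commit:

from tritonbench.utils.constants import DEFAULT_WARMUP_REP_BY_ESTIMATED_KERNEL_MS

# A kernel estimated at ~0.4 ms lands in the "1" bucket: 100 ms warmup, 100 ms rep.
fast_warmup_ms, fast_rep_ms = DEFAULT_WARMUP_REP_BY_ESTIMATED_KERNEL_MS["1"]

# Anything slower than 10 ms falls through to the "100" bucket, which keeps the
# previous fixed defaults of 3000 ms warmup and 3000 ms rep.
slow_warmup_ms, slow_rep_ms = DEFAULT_WARMUP_REP_BY_ESTIMATED_KERNEL_MS["100"]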

tritonbench/utils/parser.py

Lines changed: 4 additions & 6 deletions
@@ -7,8 +7,6 @@
     DEFAULT_ENTROPY_MAX_SAMPLES,
     DEFAULT_ENTROPY_MIN_R2,
     DEFAULT_ENTROPY_WINDOW_SIZE,
-    DEFAULT_REP,
-    DEFAULT_WARMUP,
 )
 from tritonbench.utils.env_utils import AVAILABLE_PRECISIONS, is_fbcode
 from tritonbench.utils.gpu_utils import get_gpu_device_name
@@ -76,14 +74,14 @@ def get_parser(args=None):
     parser.add_argument(
         "--warmup",
         type=int,
-        default=DEFAULT_WARMUP,
-        help="Num of warmup runs for each benchmark run.",
+        default=None,
+        help="Warmup time in ms for each benchmark run. Default: auto by estimated kernel latency.",
     )
     parser.add_argument(
         "--rep",
         type=int,
-        default=DEFAULT_REP,
-        help="The rep time for each benchmark run.",
+        default=None,
+        help="Target measurement time in ms for each benchmark run. Default: auto by estimated kernel latency.",
     )
     parser.add_argument(
         "--autotune-warmup",

tritonbench/utils/triton_op.py

Lines changed: 18 additions & 9 deletions
@@ -29,15 +29,19 @@
 from torch.utils._pytree import tree_map
 from triton.runtime.errors import OutOfResources as TritonOutOfResources
 from tritonbench.components.do_bench import do_bench_wrapper, Latency
+from tritonbench.components.do_bench.utils import (
+    estimate_cuda_runtime_ms,
+    resolve_warmup_and_rep,
+)
 from tritonbench.components.export import export_data
 from tritonbench.components.power import PowerManagerTask
 from tritonbench.data import get_input_loader
 from tritonbench.utils.constants import (
+    DEFAULT_N_REP,
+    DEFAULT_N_WARMUP,
     DEFAULT_POWER_REPCNT,
     DEFAULT_QUANTILES,
-    DEFAULT_REP,
     DEFAULT_SLEEP,
-    DEFAULT_WARMUP,
 )
 from tritonbench.utils.cudagraph_utils import CudaGraphConfig, CudaGraphError
 from tritonbench.utils.diode_utils import (
@@ -159,7 +163,7 @@ def __exit__(self, *args, **kwargs):
         self.elapsed_ms = (end_time - self._start_time) * 1e3


-def do_bench_walltime(fn, warmup=25, rep=DEFAULT_REP):
+def do_bench_walltime(fn, warmup=None, rep=None):
     fn()
     torch.cuda.synchronize()

@@ -168,10 +172,15 @@ def do_bench_walltime(fn, warmup=25, rep=DEFAULT_REP):
             fn()
         torch.cuda.synchronize()
     estimate_ms = timer.elapsed_ms / 5
+    warmup, rep = resolve_warmup_and_rep(warmup, rep, estimate_ms)

     # compute number of warmup and repeat
-    n_warmup = max(1, int(warmup / estimate_ms))
-    n_repeat = max(1, int(rep / estimate_ms))
+    if estimate_ms == 0:
+        n_warmup = DEFAULT_N_WARMUP
+        n_repeat = DEFAULT_N_REP
+    else:
+        n_warmup = max(1, int(warmup / estimate_ms))
+        n_repeat = max(1, int(rep / estimate_ms))

     # Warm-up
     for _ in range(n_warmup):
@@ -1034,8 +1043,8 @@ def benchmark_fn():

     def run(
         self,
-        warmup=DEFAULT_WARMUP,
-        rep=DEFAULT_REP,
+        warmup: int | None = None,
+        rep: int | None = None,
         quantiles=DEFAULT_QUANTILES,
         sleep=DEFAULT_SLEEP,
     ) -> None:
@@ -1901,8 +1910,8 @@ def _do_bench(
         self,
         input_id: int,
         fn_name: str,
-        warmup=DEFAULT_WARMUP,
-        rep=DEFAULT_REP,
+        warmup: int | None,
+        rep: int | None,
         repcnt=None,
         quantiles=DEFAULT_QUANTILES,
         baseline: bool = False,
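
A minimal usage sketch of do_bench_walltime under the new signature; the softmax workload is an illustrative stand-in, and with warmup and rep left as None they are resolved from the 5-iteration estimate as in the hunk above:

import torch

from tritonbench.utils.triton_op import do_bench_walltime

x = torch.randn(8192, 8192, device="cuda")
fn = lambda: torch.softmax(x, dim=-1)

# warmup/rep default to None, so the measured estimate picks the
# (100/1000/3000 ms) bucket before the wall-clock timing loop runs.
measured = do_bench_walltime(fn)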
