
Commit 7471ad0

zack041 authored and raayandhar committed
Fix autotuner oom (flashinfer-ai#2442)
## 📌 Description

Add graceful OOM handling during autotuning. When `torch.cuda.OutOfMemoryError` occurs, the autotuner now clears the CUDA cache and falls back to the default tactic `(runners[0], -1)` instead of crashing. The try-except block wraps the entire profiling loop, covering methods like `_prepare_input_tensors()` that could also cause OOM. An OOM raised inside the inner profiling loop is re-raised so that the outer exception handler catches it.

## 🔍 Related Issues

Fixes flashinfer-ai#2357

## 🚀 Pull Request Checklist

Thank you for contributing to FlashInfer! Before we review your pull request, please make sure the following items are complete.

### ✅ Pre-commit Checks

- [x] I have installed `pre-commit` by running `pip install pre-commit` (or used your preferred method).
- [x] I have installed the hooks with `pre-commit install`.
- [x] I have run the hooks manually with `pre-commit run --all-files` and fixed any reported issues.

> If you are unsure about how to set up `pre-commit`, see [the pre-commit documentation](https://pre-commit.com/).

## 🧪 Tests

- [ ] Tests have been added or updated as needed.
- [ ] All tests are passing (`unittest`, etc.).

## Reviewer Notes

No tests were added because OOM during autotuning is difficult to reproduce reliably in a test environment.

## Summary by CodeRabbit

* **Bug Fixes**
  * Improved profiling error handling so individual tactic failures are caught, logged, recorded, and do not abort tuning.
  * Added robust out-of-memory handling that clears GPU resources and falls back to safe/previous configurations instead of crashing.
  * Ensured tuning continues after non-OOM errors, preserves cache/metrics consistency, and still selects the best measured configuration when available.
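To make the new control flow concrete, the sketch below reduces the pattern this change introduces to its essentials. It is not the FlashInfer code: `choose_one_sketch`, `profile_fn`, `get_valid_tactics`, and `DEFAULT_TACTIC` are illustrative stand-ins, and only the exception-handling structure mirrors the patch (an inner re-raise of `torch.cuda.OutOfMemoryError` and an outer handler that clears the CUDA cache and returns the default `(runner, tactic)` pair).

```python
import logging

import torch

logger = logging.getLogger("autotuner_oom_sketch")

DEFAULT_TACTIC = -1  # illustrative sentinel: "let the runner use its default path"


def choose_one_sketch(runners, profile_fn, inputs):
    """Profile every (runner, tactic) pair; on OOM anywhere in the loop,
    clear the CUDA cache and fall back to the first runner's default tactic."""
    try:
        best, best_time = None, float("inf")
        for runner in runners:
            for tactic in runner.get_valid_tactics(inputs):
                try:
                    elapsed = profile_fn(runner, tactic, inputs)
                except torch.cuda.OutOfMemoryError:
                    # Re-raise so the *outer* handler performs the fallback.
                    raise
                except Exception as exc:
                    # Non-OOM failures only skip this tactic; tuning continues.
                    logger.warning("skipping tactic %s: %s", tactic, exc)
                    continue
                if elapsed < best_time:
                    best, best_time = (runner, tactic), elapsed
        # No tactic survived profiling: fall back to the default pair.
        return best if best is not None else (runners[0], DEFAULT_TACTIC)
    except torch.cuda.OutOfMemoryError:
        torch.cuda.empty_cache()  # release cached blocks held by failed attempts
        logger.warning("OOM during autotuning, falling back to default tactic")
        return runners[0], DEFAULT_TACTIC
```

The ordering of the inner handlers matters: `except torch.cuda.OutOfMemoryError: raise` is listed before the generic `except Exception` so an OOM from a single kernel is not swallowed by the per-tactic skip logic; instead it propagates to the outer handler, which frees cached memory via `torch.cuda.empty_cache()` and returns the fallback pair, matching the diff below.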
1 parent cbab15c · commit 7471ad0

1 file changed: flashinfer/autotuner.py (74 additions, 57 deletions)
@@ -466,64 +466,81 @@ def choose_one(
             }
 
         for p in profiles:
-            tensors = self._prepare_input_tensors(p, inputs)
-            is_cache_hit, runner_id, tactic, _ = self.search_cache(
-                custom_op, runners, p.get_opt_shapes(), tuning_config
-            )
-            if not is_cache_hit:
-                min_time = float("inf")
-                # Initialize runner and tactic as None in case of no valid tactic or runners are found
-                runner_id, tactic = None, None
-                for r_id, r in enumerate(runners):
-                    # TODO: use FakeTensor here.
-                    valid_tactics = r.get_valid_tactics(tensors, p)
-                    runner_arg_names = runner_arg_names_map[r]
-                    if "do_preparation" in runner_arg_names and len(valid_tactics) > 0:
-                        r(tensors, tactic=-1, do_preparation=True, **kwargs)
-                    for tac in valid_tactics:
-                        try:
-                            time_measured = self._profile_single_kernel(
-                                r, tensors, tac, **kwargs
-                            )
-                        except Exception as e:
-                            shapes = self._get_input_sizes(tensors)
-                            logger.warning(
-                                f"[Autotuner]: Skipping tactic {r} {tac}, due to failure while profiling: {e}"
-                            )
-
-                            # Log stacktrace as debug to not spam log
-                            logger.debug(
-                                f"[Autotuner]: Failed when profiling {r} {tac}, shapes={shapes}. Error occurred: {e}"
-                            )
-
-                            # Record the failed profiling combinations
-                            if custom_op not in self.stats.failed_profiling_count:
-                                self.stats.failed_profiling_count[custom_op] = set()
-                            self.stats.failed_profiling_count[custom_op].add(
-                                AutoTuner._get_cache_key(
-                                    custom_op, r, p.get_opt_shapes(), tuning_config
-                                )
-                            )
-
-                            # Set time_measured to inf to notify the failure of the tactic. This can happen when `get_valid_tactics` mistakenly return wrong tactics
-                            # or some runtime error occurs during profiling.
-                            time_measured = float("inf")
-                        if time_measured < min_time:
-                            min_time = time_measured
-                            runner_id, tactic = r_id, tac
-                if runner_id is not None:
-                    # At least one valid (runner, tactic) pair is found
-                    cache_key = AutoTuner._get_cache_key(
-                        custom_op, runners[runner_id], p.get_opt_shapes(), tuning_config
-                    )
-                    # inspect call stack
-                    self.profiling_cache[cache_key] = (runner_id, tactic, p)
-                    self.stats.tuned_op_successful_configs[custom_op] = (
-                        self.stats.tuned_op_successful_configs.get(custom_op, 0) + 1
-                    )
-                    logger.debug(
-                        f"[Autotuner]: profiling chosen runner: {runners[runner_id]} {tactic} for {cache_key}"
-                    )
+            try:
+                tensors = self._prepare_input_tensors(p, inputs)
+                is_cache_hit, runner_id, tactic, _ = self.search_cache(
+                    custom_op, runners, p.get_opt_shapes(), tuning_config
+                )
+                if not is_cache_hit:
+                    min_time = float("inf")
+                    # Initialize runner and tactic as None in case of no valid tactic or runners are found
+                    runner_id, tactic = None, None
+                    for r_id, r in enumerate(runners):
+                        # TODO: use FakeTensor here.
+                        valid_tactics = r.get_valid_tactics(tensors, p)
+                        runner_arg_names = runner_arg_names_map[r]
+                        if (
+                            "do_preparation" in runner_arg_names
+                            and len(valid_tactics) > 0
+                        ):
+                            r(tensors, tactic=-1, do_preparation=True, **kwargs)
+                        for tac in valid_tactics:
+                            try:
+                                time_measured = self._profile_single_kernel(
+                                    r, tensors, tac, **kwargs
+                                )
+                            except torch.cuda.OutOfMemoryError:
+                                raise
+                            except Exception as e:
+                                shapes = self._get_input_sizes(tensors)
+                                logger.warning(
+                                    f"[Autotuner]: Skipping tactic {r} {tac}, due to failure while profiling: {e}"
+                                )
+
+                                # Log stacktrace as debug to not spam log
+                                logger.debug(
+                                    f"[Autotuner]: Failed when profiling {r} {tac}, shapes={shapes}. Error occurred: {e}"
+                                )
+
+                                # Record the failed profiling combinations
+                                if custom_op not in self.stats.failed_profiling_count:
+                                    self.stats.failed_profiling_count[custom_op] = set()
+                                self.stats.failed_profiling_count[custom_op].add(
+                                    AutoTuner._get_cache_key(
+                                        custom_op, r, p.get_opt_shapes(), tuning_config
+                                    )
+                                )
+
+                                # Set time_measured to inf to notify the failure of the tactic. This can happen when `get_valid_tactics` mistakenly return wrong tactics
+                                # or some runtime error occurs during profiling.
+                                time_measured = float("inf")
+                            if time_measured < min_time:
+                                min_time = time_measured
+                                runner_id, tactic = r_id, tac
+
+                    if runner_id is not None:
+                        # At least one valid (runner, tactic) pair is found
+                        cache_key = AutoTuner._get_cache_key(
+                            custom_op,
+                            runners[runner_id],
+                            p.get_opt_shapes(),
+                            tuning_config,
+                        )
+                        # inspect call stack
+                        self.profiling_cache[cache_key] = (runner_id, tactic, p)
+                        self.stats.tuned_op_successful_configs[custom_op] = (
+                            self.stats.tuned_op_successful_configs.get(custom_op, 0) + 1
+                        )
+                        logger.debug(
+                            f"[Autotuner]: profiling chosen runner: {runners[runner_id]} {tactic} for {cache_key}"
+                        )
+
+            except torch.cuda.OutOfMemoryError:
+                torch.cuda.empty_cache()
+                logger.warning(
+                    "[Autotuner]: OOM detected, falling back to default tactic"
+                )
+                return runners[0], -1
 
         # Get the best runner and tactic from cache
         # If no valid tactic is found, the fallback runner and tactic will be used