From 7bc0e6300ab2c46ca14af2b86bbadf1362bf19c8 Mon Sep 17 00:00:00 2001
From: Keshav Vinayak Jha
Date: Thu, 16 Apr 2026 05:58:17 +0000
Subject: [PATCH] Fix segfault in --verify-numerics on machines with >128 CPU cores

Limit BLAS to a single thread inside compute_cpu_reference(), which
exists for correctness, not performance. Without this, OpenBLAS tries to
spawn as many threads as there are CPU cores and exceeds its compiled-in
limit (typically 128), causing a segfault.

Fixes https://github.com/iree-org/iree-turbine/issues/1336

Co-Authored-By: Claude Opus 4.6
Signed-off-by: Keshav Vinayak Jha
---
 iree/turbine/kernel/boo/driver/numerics.py | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/iree/turbine/kernel/boo/driver/numerics.py b/iree/turbine/kernel/boo/driver/numerics.py
index c041e011d..2844db2a9 100644
--- a/iree/turbine/kernel/boo/driver/numerics.py
+++ b/iree/turbine/kernel/boo/driver/numerics.py
@@ -103,8 +103,18 @@ def compute_cpu_reference(
 
     reference_module = sig.get_nn_module(use_custom=False)
 
-    with torch.no_grad():
-        result = reference_module(*ref_args)
+    # This function exists for correctness, not performance. Restrict BLAS
+    # to a single thread so that OpenBLAS (or MKL) does not try to spawn as
+    # many threads as there are CPU cores. On machines with >128 cores that
+    # exceeds OpenBLAS's compiled-in limit and segfaults.
+    # See https://github.com/iree-org/iree-turbine/issues/1336
+    prev_threads = torch.get_num_threads()
+    torch.set_num_threads(1)
+    try:
+        with torch.no_grad():
+            result = reference_module(*ref_args)
+    finally:
+        torch.set_num_threads(prev_threads)
 
     # Wrap single tensor in tuple
     if isinstance(result, torch.Tensor):