From 7bc0e6300ab2c46ca14af2b86bbadf1362bf19c8 Mon Sep 17 00:00:00 2001
From: Keshav Vinayak Jha <keshavvinayakjha@gmail.com>
Date: Thu, 16 Apr 2026 05:58:17 +0000
Subject: [PATCH] Fix segfault in --verify-numerics on machines with >128 CPU
 cores

Limit BLAS to a single thread inside compute_cpu_reference(), which
exists for correctness, not performance.  Without this, OpenBLAS tries
to spawn as many threads as there are CPU cores and exceeds its
compiled-in limit (typically 128), causing a segfault.

Fixes https://github.com/iree-org/iree-turbine/issues/1336

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Signed-off-by: Keshav Vinayak Jha <keshavvinayakjha@gmail.com>
---
 iree/turbine/kernel/boo/driver/numerics.py | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/iree/turbine/kernel/boo/driver/numerics.py b/iree/turbine/kernel/boo/driver/numerics.py
index c041e011d..2844db2a9 100644
--- a/iree/turbine/kernel/boo/driver/numerics.py
+++ b/iree/turbine/kernel/boo/driver/numerics.py
@@ -103,8 +103,18 @@ def compute_cpu_reference(
 
     reference_module = sig.get_nn_module(use_custom=False)
 
-    with torch.no_grad():
-        result = reference_module(*ref_args)
+    # This function exists for correctness, not performance.  Restrict BLAS
+    # to a single thread so that OpenBLAS (or MKL) does not try to spawn as
+    # many threads as there are CPU cores.  On machines with >128 cores that
+    # exceeds OpenBLAS's compiled-in limit and segfaults.
+    # See https://github.com/iree-org/iree-turbine/issues/1336
+    prev_threads = torch.get_num_threads()
+    torch.set_num_threads(1)
+    try:
+        with torch.no_grad():
+            result = reference_module(*ref_args)
+    finally:
+        torch.set_num_threads(prev_threads)
 
     # Wrap single tensor in tuple
     if isinstance(result, torch.Tensor):