Skip to content

Commit eb263b5

Browse files
authored
[FPSan] make tests much faster (#10016)
The slowest tests in the suite previously took 200 seconds each and now take 3.3 seconds each. The slowness was caused by the reference implementation using Python loops over scalar numpy code instead of vectorised numpy code. We also prune the number of test cases for tests with large Cartesian product parametrisations. Also, to minimise the amount of LLVM IR code generated by FPSan, we emit `scf::for` loops for `sin`, `cos`, and `exp2` instead of unrolled straight-line code: this makes sense because the principal bottleneck in FPSan use is compile time rather than runtime.
1 parent df82d98 commit eb263b5

3 files changed

Lines changed: 186 additions & 82 deletions

File tree

lib/Dialect/TritonInstrument/Transforms/FpSanitizer.cpp

Lines changed: 84 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -443,6 +443,16 @@ Value castSignedIntValueToType(PatternRewriter &rewriter, Location loc, Value v,
443443
return v;
444444
}
445445

446+
Value castScalarIntToIntLike(PatternRewriter &rewriter, Location loc,
447+
Value scalar, Type targetTy) {
448+
auto elemTy = cast<IntegerType>(getElementType(targetTy));
449+
if (scalar.getType() != elemTy)
450+
scalar = castSignedIntValueToType(rewriter, loc, scalar, elemTy);
451+
if (isa<ShapedType>(targetTy))
452+
return tt::SplatOp::create(rewriter, loc, targetTy, scalar);
453+
return scalar;
454+
}
455+
446456
Value selectUIntConstantOnSign(PatternRewriter &rewriter, Location loc,
447457
Value signSource, uint64_t signMaskValue,
448458
uint64_t nonNegativeValue,
@@ -674,45 +684,60 @@ Value fpsanSRem(PatternRewriter &rewriter, Location loc, Value num, Value den) {
674684

675685
// Modular exponentiation in payload space; this preserves
676686
// exp2(a + b) = exp2(a) * exp2(b) under the integer rewrite.
677-
Value fpsanExp2FromI32(PatternRewriter &rewriter, Location loc, Value xI,
687+
Value fpsanExp2FromInt(PatternRewriter &rewriter, Location loc, Value xI,
678688
Type floatTy) {
689+
unsigned bitWidth = getIntBitwidth(xI.getType());
679690
auto one = getIntConstantLike(rewriter, loc, xI.getType(), 1);
680691
auto zero = getIntConstantLike(rewriter, loc, xI.getType(), 0);
681692
auto c = getIntConstantLike(rewriter, loc, xI.getType(), 0xa343836d);
682693

683-
Value y = one;
684-
for (int i = 0; i < 32; ++i) {
685-
y = arith::MulIOp::create(rewriter, loc, y, y);
686-
auto bit = getIntConstantLike(rewriter, loc, xI.getType(),
687-
int64_t(1ull << (31 - i)));
688-
auto masked = arith::AndIOp::create(rewriter, loc, xI, bit);
689-
auto isZero = arith::CmpIOp::create(rewriter, loc, arith::CmpIPredicate::eq,
690-
masked, zero);
691-
auto factor = arith::SelectOp::create(rewriter, loc, isZero, one, c);
692-
y = arith::MulIOp::create(rewriter, loc, y, factor);
693-
}
694-
695-
return unembedToFloat(rewriter, loc, y, floatTy);
694+
auto lower =
695+
arith::ConstantOp::create(rewriter, loc, rewriter.getI32IntegerAttr(0));
696+
auto upper = arith::ConstantOp::create(rewriter, loc,
697+
rewriter.getI32IntegerAttr(bitWidth));
698+
auto step =
699+
arith::ConstantOp::create(rewriter, loc, rewriter.getI32IntegerAttr(1));
700+
auto topBit = arith::ConstantOp::create(
701+
rewriter, loc, rewriter.getI32IntegerAttr(bitWidth - 1));
702+
auto loop = scf::ForOp::create(rewriter, loc, lower, upper, step, one);
703+
rewriter.setInsertionPointToStart(loop.getBody());
704+
705+
Value i = loop.getInductionVar();
706+
Value y = loop.getRegionIterArgs()[0];
707+
y = arith::MulIOp::create(rewriter, loc, y, y);
708+
Value bitIndex =
709+
arith::SubIOp::create(rewriter, loc, rewriter.getI32Type(), topBit, i);
710+
Value shift = castScalarIntToIntLike(rewriter, loc, bitIndex, xI.getType());
711+
Value bit = arith::ShLIOp::create(rewriter, loc, one, shift);
712+
auto masked = arith::AndIOp::create(rewriter, loc, xI, bit);
713+
auto isZero = arith::CmpIOp::create(rewriter, loc, arith::CmpIPredicate::eq,
714+
masked, zero);
715+
auto factor = arith::SelectOp::create(rewriter, loc, isZero, one, c);
716+
y = arith::MulIOp::create(rewriter, loc, y, factor);
717+
scf::YieldOp::create(rewriter, loc, y);
718+
rewriter.setInsertionPointAfter(loop);
719+
720+
return unembedToFloat(rewriter, loc, loop.getResult(0), floatTy);
696721
}
697722

698723
Value fpsanExp2(PatternRewriter &rewriter, Location loc, Value input) {
699724
auto elemTy = dyn_cast<FloatType>(getElementType(input.getType()));
700-
if (!elemTy || elemTy.getWidth() != 32)
725+
if (!elemTy)
701726
return Value();
702-
return fpsanExp2FromI32(rewriter, loc, embedToInt(rewriter, loc, input),
727+
return fpsanExp2FromInt(rewriter, loc, embedToInt(rewriter, loc, input),
703728
input.getType());
704729
}
705730

706731
Value fpsanExp(PatternRewriter &rewriter, Location loc, Value input) {
707732
auto elemTy = dyn_cast<FloatType>(getElementType(input.getType()));
708-
if (!elemTy || elemTy.getWidth() != 32)
733+
if (!elemTy)
709734
return Value();
710735

711736
auto inputI = embedToInt(rewriter, loc, input);
712737
auto rcpLog2 =
713738
getU32ConstantLike(rewriter, loc, inputI.getType(), 0x236ee9bfu);
714739
auto scaledI = arith::MulIOp::create(rewriter, loc, inputI, rcpLog2);
715-
return fpsanExp2FromI32(rewriter, loc, scaledI, input.getType());
740+
return fpsanExp2FromInt(rewriter, loc, scaledI, input.getType());
716741
}
717742

718743
struct FpSanCosSin {
@@ -735,32 +760,47 @@ FpSanCosSin fpsanCosSinPayload(PatternRewriter &rewriter, Location loc,
735760
auto a = getUIntConstantLike(rewriter, loc, intTy, aValue);
736761
auto b = getUIntConstantLike(rewriter, loc, intTy, bValue);
737762

738-
Value c = one;
739-
Value s = zero;
740-
for (int bit = static_cast<int>(bitWidth) - 1; bit >= 0; --bit) {
741-
Value cc = arith::MulIOp::create(rewriter, loc, c, c);
742-
Value ss = arith::MulIOp::create(rewriter, loc, s, s);
743-
Value cDouble = arith::SubIOp::create(rewriter, loc, cc, ss);
744-
Value cs = arith::MulIOp::create(rewriter, loc, c, s);
745-
Value sDouble = arith::MulIOp::create(rewriter, loc, two, cs);
746-
747-
Value ac = arith::MulIOp::create(rewriter, loc, a, cDouble);
748-
Value bs = arith::MulIOp::create(rewriter, loc, b, sDouble);
749-
Value cInc = arith::SubIOp::create(rewriter, loc, ac, bs);
750-
Value as = arith::MulIOp::create(rewriter, loc, a, sDouble);
751-
Value bc = arith::MulIOp::create(rewriter, loc, b, cDouble);
752-
Value sInc = arith::AddIOp::create(rewriter, loc, as, bc);
753-
754-
auto bitMask =
755-
getUIntConstantLike(rewriter, loc, intTy, uint64_t{1} << bit);
756-
auto masked = arith::AndIOp::create(rewriter, loc, xI, bitMask);
757-
auto isZero = arith::CmpIOp::create(rewriter, loc, arith::CmpIPredicate::eq,
758-
masked, zero);
759-
c = arith::SelectOp::create(rewriter, loc, isZero, cDouble, cInc);
760-
s = arith::SelectOp::create(rewriter, loc, isZero, sDouble, sInc);
761-
}
762-
763-
return {c, s};
763+
auto lower =
764+
arith::ConstantOp::create(rewriter, loc, rewriter.getI32IntegerAttr(0));
765+
auto upper = arith::ConstantOp::create(rewriter, loc,
766+
rewriter.getI32IntegerAttr(bitWidth));
767+
auto step =
768+
arith::ConstantOp::create(rewriter, loc, rewriter.getI32IntegerAttr(1));
769+
auto topBit = arith::ConstantOp::create(
770+
rewriter, loc, rewriter.getI32IntegerAttr(bitWidth - 1));
771+
SmallVector<Value> initArgs{one, zero};
772+
auto loop = scf::ForOp::create(rewriter, loc, lower, upper, step, initArgs);
773+
rewriter.setInsertionPointToStart(loop.getBody());
774+
775+
Value bit = loop.getInductionVar();
776+
Value c = loop.getRegionIterArgs()[0];
777+
Value s = loop.getRegionIterArgs()[1];
778+
Value cc = arith::MulIOp::create(rewriter, loc, c, c);
779+
Value ss = arith::MulIOp::create(rewriter, loc, s, s);
780+
Value cDouble = arith::SubIOp::create(rewriter, loc, cc, ss);
781+
Value cs = arith::MulIOp::create(rewriter, loc, c, s);
782+
Value sDouble = arith::MulIOp::create(rewriter, loc, two, cs);
783+
784+
Value ac = arith::MulIOp::create(rewriter, loc, a, cDouble);
785+
Value bs = arith::MulIOp::create(rewriter, loc, b, sDouble);
786+
Value cInc = arith::SubIOp::create(rewriter, loc, ac, bs);
787+
Value as = arith::MulIOp::create(rewriter, loc, a, sDouble);
788+
Value bc = arith::MulIOp::create(rewriter, loc, b, cDouble);
789+
Value sInc = arith::AddIOp::create(rewriter, loc, as, bc);
790+
791+
Value bitIndex =
792+
arith::SubIOp::create(rewriter, loc, rewriter.getI32Type(), topBit, bit);
793+
Value shift = castScalarIntToIntLike(rewriter, loc, bitIndex, intTy);
794+
Value bitMask = arith::ShLIOp::create(rewriter, loc, one, shift);
795+
auto masked = arith::AndIOp::create(rewriter, loc, xI, bitMask);
796+
auto isZero = arith::CmpIOp::create(rewriter, loc, arith::CmpIPredicate::eq,
797+
masked, zero);
798+
c = arith::SelectOp::create(rewriter, loc, isZero, cDouble, cInc);
799+
s = arith::SelectOp::create(rewriter, loc, isZero, sDouble, sInc);
800+
scf::YieldOp::create(rewriter, loc, ValueRange{c, s});
801+
rewriter.setInsertionPointAfter(loop);
802+
803+
return {loop.getResult(0), loop.getResult(1)};
764804
}
765805

766806
Value fpsanCos(PatternRewriter &rewriter, Location loc, Value input) {

python/test/conftest.py

Lines changed: 47 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,23 @@
1-
import pytest
1+
from collections import defaultdict
2+
import hashlib
23
import tempfile
34

5+
import pytest
6+
7+
8+
def _top_level_test_key(item):
9+
nodeid = item.nodeid
10+
bracket = nodeid.find("[")
11+
return nodeid if bracket == -1 else nodeid[:bracket]
12+
13+
14+
def _case_key(item):
15+
return item.name
16+
17+
18+
def _sha256_hex(s: str) -> str:
19+
return hashlib.sha256(s.encode("utf-8")).hexdigest()
20+
421

522
def pytest_configure(config):
623
# If pytest-sugar is not active, enable instafail
@@ -10,6 +27,35 @@ def pytest_configure(config):
1027

1128
def pytest_addoption(parser):
1229
parser.addoption("--device", action="store", default="cuda")
30+
parser.addoption(
31+
"--max-cases-per-test",
32+
action="store",
33+
type=int,
34+
default=100,
35+
help="Maximum number of cases per top-level test",
36+
)
37+
38+
39+
def pytest_collection_modifyitems(config, items):
40+
max_cases = config.getoption("--max-cases-per-test")
41+
if max_cases <= 0:
42+
return
43+
44+
groups = defaultdict(list)
45+
for item in items:
46+
groups[_top_level_test_key(item)].append(item)
47+
48+
kept = []
49+
deselected = []
50+
for group in groups.values():
51+
ordered = sorted(group, key=lambda item: _sha256_hex(_case_key(item)))
52+
kept.extend(ordered[:max_cases])
53+
deselected.extend(ordered[max_cases:])
54+
55+
if deselected:
56+
config.hook.pytest_deselected(items=deselected)
57+
58+
items[:] = kept
1359

1460

1561
@pytest.fixture

python/test/gluon/test_fpsan.py

Lines changed: 55 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -1333,31 +1333,47 @@ def _mm_scaled_payload_u32(a_u8: np.ndarray, b_u8: np.ndarray, a_scale_u8: np.nd
13331333
assert a_scale.shape == (m, k // 32)
13341334
assert b_scale.shape == (n, k // 32)
13351335

1336-
def unpack(data: np.ndarray, row: int, col: int, pack: int, pack_axis: int) -> np.uint16:
1336+
def unpack_payload_matrix(data: np.ndarray, pack: int, pack_axis: int) -> np.ndarray:
13371337
if pack == 1:
1338-
return np.uint16(data[row, col])
1339-
return np.uint16(_unpack_element(data, row, col, pack, pack_axis=pack_axis))
1338+
return data.astype(np.uint64)
1339+
assert pack == 2
1340+
if pack_axis == 1:
1341+
out = np.empty((data.shape[0], data.shape[1] * pack), dtype=np.uint64)
1342+
out[:, 0::2] = data.astype(np.uint64) & np.uint64(0x0F)
1343+
out[:, 1::2] = (data.astype(np.uint64) >> np.uint64(4)) & np.uint64(0x0F)
1344+
return out
1345+
out = np.empty((data.shape[0] * pack, data.shape[1]), dtype=np.uint64)
1346+
out[0::2, :] = data.astype(np.uint64) & np.uint64(0x0F)
1347+
out[1::2, :] = (data.astype(np.uint64) >> np.uint64(4)) & np.uint64(0x0F)
1348+
return out
1349+
1350+
def compute_payload_matrix(data: np.ndarray) -> np.ndarray:
1351+
if elem_type in ("e4m3", "e5m2"):
1352+
one_bits = 0x38 if elem_type == "e4m3" else 0x3C
1353+
payload = _mix_float_bits_to_payload_u64(data, 8, one_bits)
1354+
return _signed_cast_payload_u64(payload, 8, 16)
1355+
return data & np.uint64(0xFFFF)
1356+
1357+
def scale_payload_matrix(raw_scale: np.ndarray) -> np.ndarray:
1358+
raw_bf16 = (raw_scale & np.uint64(0xFF)) << np.uint64(7)
1359+
return _mix_float_bits_to_payload_u64(raw_bf16, 16, 0x3F80)
13401360

1341-
out = np.empty((m, n), dtype=np.uint64)
1342-
compute_type = "bf16"
1361+
a_payload = compute_payload_matrix(unpack_payload_matrix(a_u8, a_pack, pack_axis=1))
1362+
b_payload = compute_payload_matrix(unpack_payload_matrix(b_u8, b_pack, pack_axis=0))
1363+
a_scale_payload = scale_payload_matrix(a_scale)
1364+
b_scale_payload = scale_payload_matrix(b_scale)
1365+
1366+
out = c_u.copy() if c_u is not None else np.zeros((m, n), dtype=np.uint64)
13431367
compute_mask = np.uint64(0xFFFF)
13441368
mask32 = np.uint64(0xFFFFFFFF)
1345-
for i in range(m):
1346-
for j in range(n):
1347-
s = c_u[i, j] if c_u is not None else 0
1348-
for kk in range(k):
1349-
a_val = unpack(a_u8, i, kk, a_pack, pack_axis=1)
1350-
b_val = unpack(b_u8, kk, j, b_pack, pack_axis=0)
1351-
a_val = _dot_scaled_compute_payload_elem(np.uint64(a_val), elem_type, compute_type)
1352-
b_val = _dot_scaled_compute_payload_elem(np.uint64(b_val), elem_type, compute_type)
1353-
a_scale_val = _dot_scaled_scale_payload(a_scale[i, kk // 32], compute_type)
1354-
b_scale_val = _dot_scaled_scale_payload(b_scale[j, kk // 32], compute_type)
1355-
lhs = (a_val * a_scale_val) & compute_mask
1356-
rhs = (b_val * b_scale_val) & compute_mask
1357-
lhs = _signed_cast_payload_scalar(lhs, 16, 32)
1358-
rhs = _signed_cast_payload_scalar(rhs, 16, 32)
1359-
s = (s + ((np.uint64(lhs) * np.uint64(rhs)) & mask32)) & mask32
1360-
out[i, j] = s
1369+
for group in range(k // 32):
1370+
start = group * 32
1371+
end = start + 32
1372+
lhs = (a_payload[:, start:end] * a_scale_payload[:, group:group + 1]) & compute_mask
1373+
rhs = (b_payload[start:end, :] * b_scale_payload[:, group][None, :]) & compute_mask
1374+
lhs = _signed_cast_payload_u64(lhs, 16, 32)
1375+
rhs = _signed_cast_payload_u64(rhs, 16, 32)
1376+
out = (out + (lhs @ rhs)) & mask32
13611377
return _unmix_payload_u32_to_f32_bits_i32(out.astype(np.uint32))
13621378

13631379

@@ -1759,31 +1775,33 @@ def test_reduction(device, fresh_knobs):
17591775
_require_cuda_backend(device)
17601776

17611777
@triton.jit
1762-
def reduce_kernel(a_ptr, c_ptr, M: tl.constexpr, N: tl.constexpr, stride_am: tl.constexpr, stride_ak: tl.constexpr,
1763-
ORDER: tl.constexpr):
1764-
a_ptrs = a_ptr + (tl.arange(0, M)[:, None] * stride_am + (tl.arange(0, N)[None, :]) * stride_ak)
1778+
def reduce_kernel(a_ptr, c_ptr, M: tl.constexpr, N: tl.constexpr, stride_ak: tl.constexpr, stride_am: tl.constexpr,
1779+
stride_an: tl.constexpr, ORDER: tl.constexpr):
1780+
1781+
a_ptr += tl.program_id(0).to(tl.int64) * stride_ak
1782+
c_ptr += tl.program_id(0).to(tl.int64)
1783+
a_ptrs = a_ptr + (tl.arange(0, M)[:, None] * stride_am + (tl.arange(0, N)[None, :]) * stride_an)
17651784
a = tl.load(a_ptrs)
17661785
r1 = tl.sum(a, axis=ORDER)
1767-
r2 = tl.sum(r1, axis=ORDER - 1)
1786+
r2 = tl.sum(r1, axis=0)
17681787
tl.store(c_ptr, r2)
17691788

1770-
M, N = 512, 512
1789+
# we run K parallel tests so as to make non-associativity much more
1790+
# likely to manifest:
1791+
K, M, N = 100, 128, 128
17711792
torch.manual_seed(0)
1772-
a = torch.randn((M, N), dtype=torch.float32, device="cuda")
1773-
# Make non-associativity visible and deterministic: large + tiny magnitudes.
1774-
a[:, :64] *= 1e10
1775-
a[:, 64:] *= 1e-10
1776-
c1 = torch.empty((1, ), dtype=torch.float32).to('cuda')
1777-
c2 = torch.empty((1, ), dtype=torch.float32).to('cuda')
1778-
1779-
reduce_kernel[(1, )](a, c1, M=M, N=N, stride_am=a.stride(0), stride_ak=a.stride(1), ORDER=0)
1780-
reduce_kernel[(1, )](a, c2, M=M, N=N, stride_am=a.stride(0), stride_ak=a.stride(1), ORDER=1)
1793+
a = torch.randn((K, M, N), dtype=torch.float32, device="cuda")
1794+
c1 = torch.empty((K, ), dtype=torch.float32).to('cuda')
1795+
c2 = torch.empty((K, ), dtype=torch.float32).to('cuda')
1796+
1797+
reduce_kernel[(K, )](a, c1, M, N, a.stride(0), a.stride(1), a.stride(2), ORDER=0)
1798+
reduce_kernel[(K, )](a, c2, M, N, a.stride(0), a.stride(1), a.stride(2), ORDER=1)
17811799
assert not _payload_equal(c1, c2)
17821800

17831801
fresh_knobs.compilation.instrumentation_mode = "fpsan"
17841802

1785-
reduce_kernel[(1, )](a, c1, M=M, N=N, stride_am=a.stride(0), stride_ak=a.stride(1), ORDER=0)
1786-
reduce_kernel[(1, )](a, c2, M=M, N=N, stride_am=a.stride(0), stride_ak=a.stride(1), ORDER=1)
1803+
reduce_kernel[(K, )](a, c1, M, N, a.stride(0), a.stride(1), a.stride(2), ORDER=0)
1804+
reduce_kernel[(K, )](a, c2, M, N, a.stride(0), a.stride(1), a.stride(2), ORDER=1)
17871805
assert _payload_equal(c1, c2)
17881806

17891807

0 commit comments

Comments
 (0)