
Commit 3fd1ef8

cleanup
1 parent 4d94143 commit 3fd1ef8

9 files changed

Lines changed: 40 additions & 318 deletions


python/triton_kernels/tests/test_matmul.py

Lines changed: 2 additions & 11 deletions
@@ -381,7 +381,6 @@ def _test_op(m, n, k, split_k, do_gather, do_scatter, inner_expt_opt, do_gamma,
     # --- create precision config ---
     wrap_list = lambda vals: torch.tensor(vals, dtype=torch.float32, device=device)
     c_scale_global = wrap_list([4.00]) if c_dtype.has_global_scale else None
-    c_absmax = wrap_list([0]) if c_dtype.has_global_scale else None
     precision_opt = PrecisionConfig(
         acc_scale=2.0 if c_dtype.has_global_scale or b_dtype.has_global_scale else 1.0,
         out_dtype=c_dtype.torch_dtype,
@@ -402,23 +401,18 @@ def _test_op(m, n, k, split_k, do_gather, do_scatter, inner_expt_opt, do_gamma,
         tri_y = matmul(a, b, bias,
                        a_ragged_metadata, b_ragged_metadata,
                        gather_indx, scatter_indx, precision_opt,
-                       gammas=gammas, epilogue=epilogue, c=c, c_absmax=c_absmax,
+                       gammas=gammas, epilogue=epilogue, c=c,
                        fused_activation=fused_activation)
-        if c_dtype.has_global_scale:
-            tri_y_scale = c_absmax.clone()
     except (opt_flags.InapplicableConstraint, NotImplementedError) as e:
         pytest.skip(f"inapplicable opt_flags constraint {e}")
     # --- torch implementation ---
     ref_y = matmul_torch(a, b, bias, #
                          a_ragged_metadata, b_ragged_metadata,
                          gather_indx, scatter_indx, precision_opt,
                          gammas=gammas,
-                         c=c,
-                         c_absmax=c_absmax)
+                         c=c)
     if swiglu_opts is not None:
         ref_y = swiglu(ref_y, alpha=swiglu_opts[0], precision_config=SwiGLUPrecisionConfig(swiglu_opts[1]))
-    if c_dtype.has_global_scale:
-        ref_y_scale = c_absmax.clone()

     # --- check results ---
     if c_dtype.has_mx_scale:
@@ -430,9 +424,6 @@ def _test_op(m, n, k, split_k, do_gather, do_scatter, inner_expt_opt, do_gamma,
     elif b_dtype.is_mxfloat4:
         maxtol, rmstol = 3e-2, None
     assert_close(ref_y, tri_y, maxtol=maxtol, rmstol=rmstol)
-    if c_dtype.has_global_scale:
-        assert torch.all((ref_y_scale - tri_y_scale).abs() < 1e-10), \
-            f"ref_y_scale: {ref_y_scale}, tri_y_scale: {tri_y_scale.item()}"


 def test_set_idle_sms():

python/triton_kernels/tests/test_matmul_details/test_opt_flags_split_k.py

Lines changed: 3 additions & 3 deletions
@@ -44,7 +44,7 @@ def setup_nvidia(monkeypatch):
     monkeypatch.setattr(
         opt_flags.opt_flags_nvidia,
         "compute_block_n",
-        lambda n, arch, precision_config: (64, 32),
+        lambda n, arch, precision_config, **kwargs: (64, 32),
     )
     monkeypatch.setattr(
         opt_flags.opt_flags_nvidia,
@@ -54,7 +54,7 @@ def setup_nvidia(monkeypatch):
     monkeypatch.setattr(
         opt_flags.opt_flags_nvidia,
         "compute_block_k",
-        lambda m, k, is_persistent, lhs_dtype, rhs_dtype, precision_config, has_y_acc_in: 32,
+        lambda m, k, is_persistent, lhs_dtype, rhs_dtype, precision_config, has_y_acc_in, **kwargs: 32,
     )
     monkeypatch.setattr(
         opt_flags.opt_flags_nvidia,
@@ -69,7 +69,7 @@ def setup_nvidia(monkeypatch):
     monkeypatch.setattr(
         opt_flags.opt_flags_nvidia,
         "compute_num_warps",
-        lambda block_m, block_n, is_persistent, precision_config, constraints: 4,
+        lambda block_m, block_n, is_persistent, precision_config, constraints, **kwargs: 4,
     )

     fake_target = types.SimpleNamespace(backend="cuda", arch=100)
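
The `**kwargs` added to each stubbed lambda keeps the monkeypatched helpers tolerant of extra keyword arguments passed by newer callers. A minimal standalone illustration of the pattern, with made-up names that are not the library's API:

# A stub with a fixed signature breaks as soon as the patched helper gains a
# new keyword argument; accepting **kwargs keeps the pinned return value working.
def stub_rigid(n, arch, precision_config):
    return 64, 32

def stub_tolerant(n, arch, precision_config, **kwargs):
    return 64, 32

stub_tolerant(128, "sm100", None, has_y_acc_in=True)   # fine
# stub_rigid(128, "sm100", None, has_y_acc_in=True)    # would raise TypeError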

python/triton_kernels/tests/test_reduce.py

Lines changed: 1 addition & 14 deletions
@@ -70,19 +70,10 @@ def test_op(B, M, N, dtype_str, dim, mask_mode, postprocess_fn):
     device = "cuda"
     x = torch.randn((B, M, N), device=device, dtype=torch.float32, requires_grad=True)
     x_scale_mx, x_scale_global = None, None
-    y_scale_global, y_absmax_tri, y_absmax_ref = None, None, None
+    y_scale_global = None
     if is_mx := dtype_str.startswith("mx"):
         dtype = dtype_str_to_torch(dtype_str.removeprefix("mx"))
         x, x_scale_mx = downcast_to_mxfp_torch(x.to(torch.float16), dtype, axis=-1)
-    if is_flex := dtype_str.startswith("flex"):
-        dtype = dtype_str_to_torch(dtype_str.removeprefix("flex"))
-        expected_scale = torch.tensor([4], device=device, dtype=torch.float32)
-        x_scale_global = torch.tensor([2], device=device, dtype=torch.float32)
-        x = x / x_scale_global
-        x = x.to(dtype)
-        y_scale_global = expected_scale
-        y_absmax_tri = torch.zeros_like(expected_scale)
-        y_absmax_ref = torch.zeros_like(expected_scale)
     mask = init_mask(mask_mode, B, M, N, device)
     expected_exception = ValueError if dim == 2 and is_mx else None
     if expected_exception is not None:
@@ -105,7 +96,6 @@ def test_op(B, M, N, dtype_str, dim, mask_mode, postprocess_fn):
         x_scale_mx=x_scale_mx,
         x_scale_global=x_scale_global,
         y_scale_global=y_scale_global,
-        y_absmax=y_absmax_tri,
         postprocess_fn1=postprocess_fn_tri,
     )
     y_ref, y_ref_mxscale = reduce_torch(
@@ -115,15 +105,12 @@ def test_op(B, M, N, dtype_str, dim, mask_mode, postprocess_fn):
         x_scale_mx=x_scale_mx,
         x_scale_global=x_scale_global,
         y_scale_global=y_scale_global,
-        y_absmax=y_absmax_ref,
         postprocess_fn1=postprocess_fn_ref,
     )
     if is_mx:
         y_ref = upcast_from_mxfp_torch(y_ref, y_ref_mxscale, torch.float16, axis=-1)
         y_tri = upcast_from_mxfp_torch(y_tri, y_tri_mxscale, torch.float16, axis=-1)
     assert torch.allclose(y_tri.float(), y_ref.float(), atol=1e-3, rtol=1e-3)
-    if is_flex:
-        assert torch.allclose(y_absmax_tri, y_absmax_ref, atol=1e-3, rtol=1e-3)
     run_bwd = postprocess_fn is None and "float8" not in dtype_str
     if run_bwd:
         dy = torch.randn_like(y_tri)

python/triton_kernels/triton_kernels/matmul.py

Lines changed: 2 additions & 22 deletions
@@ -223,7 +223,6 @@ def matmul(a, b, bias,
            gammas: torch.Tensor | None = None,
            out_alpha: float | None = None,
            c: torch.Tensor | Tensor | None = None,
-           c_absmax: torch.Tensor | None = None,
            fused_comm: FusedComm | None = None,
            fused_activation: FusedActivation | None = None,
            epilogue: Epilogue | None = None,
@@ -491,12 +490,11 @@ def matmul(a, b, bias,
     } if fused_comm is not None else {}
     n_valid_slices = b_tensor_or_tma.shape[0] if ragged_dimension == "M" else n_slices
     # split-k scratchpad is fp32/fp16 accumulation, not the final output dtype.
-    # output flex scaling is applied in the reduce step.
+    # output scaling is applied in the reduce step.
     out_global_scale = None if has_scratchpad else c_scale_global
-    out_absmax = None if has_scratchpad else c_absmax
     (kernels._p_matmul if opt_flags.is_persistent else kernels._matmul)[(grid,)](
         c_tensor_or_tma, c.storage.data, *out_matmul.stride(),
-        *((None, out_matmul_scale, None) if out_matmul_has_mx else (out_global_scale, out_absmax, None)),
+        *((None, out_matmul_scale, None) if out_matmul_has_mx else (out_global_scale, None, None)),
         *out_matmul_scale_strides[-4:],
         a_tensor_or_tma, a.storage.data, *a_strides, a_transpose,
         a.scale_global,
@@ -564,8 +562,6 @@ def matmul(a, b, bias,
         y = memory["output"].view(-1, memory["output"].shape[-1]),
         y_dtype = memory["output"].dtype,
         y_scale_global = c_scale_global,
-        y_absmax = c_absmax,
-        y_saturate_inf = precision_config.flexpoint_saturate_inf,
         y_has_mx = c_scale_mx is not None,
         # fused functions
         postprocess_fn1 = postprocess_fn1,
@@ -639,17 +635,6 @@ def scale(val, scal):
     assert val.ndim == 3
     return val / scal[:, None, None]

-def compute_actual_scale(x, dtype, per_batch_scale=False):
-    from triton_kernels.numerics import MAX_FINITE_FLOAT8E4B8, MAX_FINITE_FLOAT8E4NV, MAX_FINITE_FLOAT8E5
-    max_finite = {
-        torch.float8_e5m2: MAX_FINITE_FLOAT8E5,
-        torch.float8_e4m3fn: MAX_FINITE_FLOAT8E4NV,
-        torch.float8_e4m3fnuz: MAX_FINITE_FLOAT8E4B8,
-    }[dtype]
-    maxvals = x.abs().amax(dim=tuple(range(1, x.ndim))) if per_batch_scale else x.abs().max()
-    return maxvals / max_finite
-
-
 def matmul_torch(a, b, bias,
                  a_ragged_metadata: RaggedTensorMetadata | None = None,
                  b_ragged_metadata: RaggedTensorMetadata | None = None,
@@ -660,7 +645,6 @@ def matmul_torch(a, b, bias,
                  gammas = None,
                  round_x = None, round_y = None,
                  c: torch.Tensor | Tensor | None = None,
-                 c_absmax: torch.Tensor | None = None,
                  ):
     if precision_config is None:
         precision_config = PrecisionConfig()
@@ -696,8 +680,6 @@ def matmul_torch(a, b, bias,
                 round_y=round_y,
             )
            out[expt] = out_expt.to(out.dtype)
-        if c_absmax is not None:
-            c_absmax.copy_(compute_actual_scale(out, precision_config.out_dtype))
         return scale(out, None if c is None else c.scale_global)

     is_input_batched = a.ndim == 3
@@ -748,8 +730,6 @@ def matmul_torch(a, b, bias,
         out = torch.zeros((scatter_indx.shape[0], y.shape[-1]), dtype=y.dtype, device=a.device)
         msk = scatter_indx != -1
         out[scatter_indx[msk], :] = y[msk, :]
-        if c_absmax is not None:
-            c_absmax.copy_(compute_actual_scale(out, precision_config.out_dtype))
         return scale(out, None if c is None else c.scale_global)

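With `compute_actual_scale` and the `c_absmax` output argument gone, a caller that still wants a per-tensor scale for a float8 output can derive it from the returned tensor itself. A minimal sketch of that computation outside the library, with an illustrative helper name (448.0 is the largest finite float8_e4m3fn value):

import torch

# Hedged sketch: reproduce the removed absmax / max_finite scale computation
# on the caller side; this helper is not part of the matmul() API.
def output_scale_from_result(y: torch.Tensor, max_finite: float = 448.0) -> torch.Tensor:
    return y.abs().max() / max_finite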

python/triton_kernels/triton_kernels/matmul_details/_matmul.py

Lines changed: 7 additions & 3 deletions
@@ -6,7 +6,6 @@
 from triton_kernels.tensor_details.layout_details.hopper_scale import unswizzle_mxfp4_scale_hopper
 from triton_kernels.tensor_details.layout_details.hopper_value import mxfp4_to_bf16_triton
 from triton_kernels.tensor_details.layout_details.cdna4_scale import unswizzle_mx_scale_cdna4
-from triton_kernels.numerics_details.flexpoint import float_to_flex, load_scale
 from triton_kernels.numerics_details.mxfp_details._downcast_to_mxfp import MXFP_BLOCK_SIZE
 from triton_kernels.target_info import cuda_capability_geq
 from ._common import (
@@ -23,6 +22,11 @@ def round_f32_to_tf32(x: tl.tensor):
     ASM: tl.constexpr = "cvt.rna.tf32.f32 $0, $1;"
     return tl.inline_asm_elementwise(ASM, "=r, r", [x], dtype=tl.float32, is_pure=True, pack=1)

+
+@triton.jit
+def load_scale(scale_ptr):
+    return 1.0 if scale_ptr is None else tl.load(scale_ptr)
+
 _matmul_repr = make_matmul_repr("_matmul", [0, 1, 2])
 @triton.jit(do_not_specialize=["TOKENS_PER_EXPT_FOR_ANNOTATION"],
             repr=_matmul_repr, launch_metadata=matmul_launch_metadata)
@@ -483,8 +487,8 @@ def _matmul(
     else:
         if PER_BATCH_OUT_SCALE:
             YExpectedScale = YExpectedScale + start_z_out
-            YActualScale = YActualScale + start_z_out
-        out = float_to_flex(out, YExpectedScale, YActualScale, YChecksumScale, mask, Y, FLEXPOINT_SATURATE_INF)
+        if YExpectedScale is not None:
+            out = out / load_scale(YExpectedScale)
     if EPILOGUE_FN is not None and not IS_EPILOGUE_QUANT_MXFP8:
         out = EPILOGUE_FN(out, *epilogue_fn_args, target_dtype=YPtrs.dtype.element_ty)
     if pYPtrs is None:
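With `float_to_flex` removed, the epilogue's output scaling reduces to dividing by the expected global scale when one is provided; no running absmax is tracked and no saturation is applied. A host-side sketch of the same semantics, with illustrative names rather than the kernel's actual variables:

import torch

# Hedged sketch of the new epilogue behavior at the tensor level:
# divide by the statically provided scale if there is one, otherwise pass through.
def scale_output(acc: torch.Tensor, expected_scale: torch.Tensor | None) -> torch.Tensor:
    return acc if expected_scale is None else acc / expected_scale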

python/triton_kernels/triton_kernels/matmul_details/_p_matmul.py

Lines changed: 8 additions & 31 deletions
@@ -7,12 +7,6 @@
 from triton.tools.ragged_tma import load_ragged, store_ragged
 from triton_kernels import target_info
 from triton_kernels.tensor_details.layout_details.blackwell_scale import unswizzle_mx_scale_bw, unswizzle_act_mx_scale_bw
-from triton_kernels.numerics_details.flexpoint import (
-    float_to_flex,
-    load_scale,
-    nan_propagating_absmax_reduce,
-    compute_scale,
-)
 from triton_kernels.numerics_details.mxfp_details._downcast_to_mxfp import MXFP_BLOCK_SIZE
 from triton_kernels.tensor_details.layout_details.hopper_scale import unswizzle_mxfp4_scale_hopper
 from triton_kernels.tensor_details.layout_details.hopper_value import mxfp4_to_bf16_triton
@@ -38,6 +32,10 @@ def get_dtype(tensor_or_desc: tl.tensor | tl.tensor_descriptor) -> tl.dtype:
     else:
         raise ValueError(f"Invalid type: {type(tensor_or_desc)}")

+@triton.jit
+def load_scale(scale_ptr):
+    return 1.0 if scale_ptr is None else tl.load(scale_ptr)
+
 @triton.jit
 def _load_writeback_idx_and_mask(WriteBackIndx, writeback_size, offs, mask):
     mask = mask & (offs < writeback_size)
@@ -172,7 +170,7 @@ def _p_matmul(

     index_type: tl.constexpr = tl.int64

-    USE_FLEXPOINT_SCALE: tl.constexpr = YActualScale is not None or YChecksumScale is not None
+    USE_FLEXPOINT_SCALE: tl.constexpr = YExpectedScale is not None or YChecksumScale is not None
     HAS_SCATTER: tl.constexpr = WriteBackIndx is not None
     HAS_GATHER: tl.constexpr = GatherIndx is not None
     USE_GATHER_TMA: tl.constexpr = HAS_GATHER and X_TMA_MODE == "dense"
@@ -200,12 +198,6 @@ def _p_matmul(
     if INDEPENDENT_EPILOGUE:
         tile_id1 = tl.program_id(0) - NUM_SMS

-    # Keep track of local max for updating flexpoint scales.
-    USE_LOCAL_ABSMAX: tl.constexpr = (YActualScale is not None) and (not PER_BATCH_OUT_SCALE) and (not is_out_microscaled) and (pYPtrs is None)
-    if USE_LOCAL_ABSMAX:
-        THREADS_PER_BLOCK: tl.constexpr = tl.extra.cuda.num_threads()
-        local_absmax = tl.full([THREADS_PER_BLOCK], 0.0, tl.uint32)
-
     DISALLOW_ACC_MULTI_BUFFER: tl.constexpr = is_w_microscaled and BLOCK_M * BLOCK_N >= 128 * 256

     for block_id in tl.range(
@@ -566,23 +558,13 @@ def _p_matmul(
                 YActualScalePtrs = YActualScale + offs_y_mx_k.to(index_type) * stride_y_mx_k + offs_y_mx_z.to(index_type) * stride_y_mx_z + offs_y_mx_m.to(index_type)[:, None] * stride_y_mx_m + offs_y_n_scale.to(index_type)[None, :] * stride_y_mx_n
                 tl.store(YActualScalePtrs, out_scale, mask=mask_m[:, None] & mask_n_scale[None, :])
             else:
-                # Flexpoint
-                if USE_LOCAL_ABSMAX:
-                    out_view = tl.reshape(out, [out.numel // THREADS_PER_BLOCK, THREADS_PER_BLOCK], can_reorder=True)
-                    local_absmax = tl.maximum(local_absmax, nan_propagating_absmax_reduce(out_view, axis=0))
-
+                # Global scale
                 if PER_BATCH_OUT_SCALE:
                     ExpectedScale = YExpectedScale + start_z1
-                    ActualScale = YActualScale + start_z1
                 else:
                     ExpectedScale = YExpectedScale
-                    ActualScale = None  # local absmax is tracked and updated after the loop
-
-                out = float_to_flex(
-                    out, ExpectedScale, ActualScale, YChecksumScale,
-                    None,  # mask: out is manually masked to 0
-                    YPtr, FLEXPOINT_SATURATE_INF
-                )
+                if ExpectedScale is not None:
+                    out = out / load_scale(ExpectedScale)
             if EPILOGUE_FN is not None and not IS_EPILOGUE_QUANT_MXFP8:
                 out = EPILOGUE_FN(out, *epilogue_fn_args, target_dtype=YPtr.dtype.element_ty, pid=len(accs)*tile_id1 + a_i)

@@ -636,11 +618,6 @@ def _p_matmul(
             tl.multiple_of(peer_Y_ptr, [16, 16])
             tl.store(peer_Y_ptr + offs_kzmn, out, mask=mask)

-
-    # Update the flexpoint scales
-    if USE_LOCAL_ABSMAX:
-        tl.atomic_max(YActualScale, compute_scale(local_absmax.to(tl.float32, bitcast=True), YPtr), sem="relaxed")
-
     if pYPtrs is not None:
         all_writes_issued.fn(*all_writes_issued.captured)

