
Commit 4d94143

more flexpoint removal
1 parent af2196d commit 4d94143

8 files changed

Lines changed: 176 additions & 230 deletions


python/triton_kernels/tests/test_matmul.py

Lines changed: 5 additions & 14 deletions
@@ -328,7 +328,7 @@ def _test_op(m, n, k, split_k, do_gather, do_scatter, inner_expt_opt, do_gamma,
     do_scatter = do_scatter and mode != "batched"

     # --- create inputs ---
-    a, a_scales, a_ragged_metadata = make_random_tensor(
+    a, a_ragged_metadata = make_random_tensor(
         shape=(m, k),
         n_slices = n_slices,
         dtype = a_dtype,
@@ -339,8 +339,9 @@ def _test_op(m, n, k, split_k, do_gather, do_scatter, inner_expt_opt, do_gamma,
         ragged_padding = inner_expt_opt is not None and "pad_a" in inner_expt_opt,
         squeeze_batch_dim = mode == "plain",
         scale_hbm_swizzling = layout.make_default_matmul_mxfp8_act_scale_layout if a_hbm_swizzling else None,
+        scale_global = 1.25 if a_dtype.has_global_scale else None,
     )
-    b, b_scale_tri, b_ragged_metadata = make_random_tensor(
+    b, b_ragged_metadata = make_random_tensor(
         shape=(k, n),
         n_slices = n_slices,
         dtype = b_dtype,
@@ -353,6 +354,7 @@ def _test_op(m, n, k, split_k, do_gather, do_scatter, inner_expt_opt, do_gamma,
         is_mx_rowmajor = not colmajor_mxfp_weight,
         value_hbm_swizzling = layout.make_default_matmul_mxfp4_w_layout(mx_axis=-2) if b_hbm_swizzling and colmajor_mxfp_weight and b_dtype.is_mxfloat4 else None,
         scale_hbm_swizzling = layout.make_default_matmul_mxfp4_w_scale_layout(mx_axis=-2, num_warps=num_warps) if b_hbm_swizzling and colmajor_mxfp_weight and b_dtype.is_mxfloat4 else None,
+        scale_global = 1.25 if b_dtype.has_global_scale else None,
     )
     if not isinstance(a, Tensor):
         a = wrap_torch_tensor(a)
@@ -378,32 +380,21 @@ def _test_op(m, n, k, split_k, do_gather, do_scatter, inner_expt_opt, do_gamma,

     # --- create precision config ---
     wrap_list = lambda vals: torch.tensor(vals, dtype=torch.float32, device=device)
-    a_scale_global = wrap_list([1.25]) if c_dtype.has_global_scale else None
-    b_scale_global = wrap_list([1.25]) if b_dtype.has_global_scale else None
     c_scale_global = wrap_list([4.00]) if c_dtype.has_global_scale else None
     c_absmax = wrap_list([0]) if c_dtype.has_global_scale else None
     precision_opt = PrecisionConfig(
         acc_scale=2.0 if c_dtype.has_global_scale or b_dtype.has_global_scale else 1.0,
         out_dtype=c_dtype.torch_dtype,
     )
-    c_scale_mx = None

     # --- create epilogue ---
+    c_scale_mx = None
     epilogue = None
     if c_dtype.has_mx_scale:
         c_scale_shape = c_shape[:-1] + (triton.cdiv(c_shape[-1], MXFP_BLOCK_SIZE),)
         c_scale_mx = torch.empty(c_scale_shape, dtype=torch.uint8, device=a.device)
         epilogue_spec = FnSpecs(FnName.QUANTIZE_MXFP8.name, quantize_mxfp8_fn, (), ())
         epilogue = Epilogue(epilogue_spec, tuple(), tuple(), effective_itemsize=6.0)
-
-    if isinstance(a, Tensor):
-        a = Tensor(a.storage, dtype=a.dtype, shape=a.shape, shape_max=a.shape_max, scale_global=a_scale_global, scale_mx=a_scales)
-    else:
-        a = wrap_torch_tensor(a, scale_global=a_scale_global, scale_mx=a_scales)
-    if isinstance(b, Tensor):
-        b = Tensor(b.storage, dtype=b.dtype, shape=b.shape, shape_max=b.shape_max, scale_global=b_scale_global, scale_mx=b_scale_tri)
-    else:
-        b = wrap_torch_tensor(b, scale_global=b_scale_global, scale_mx=b_scale_tri)
     c = wrap_torch_tensor(c, scale_global=c_scale_global, scale_mx=c_scale_mx)

     # --- triton implementation ---

python/triton_kernels/tests/test_reduce.py

Lines changed: 30 additions & 14 deletions
@@ -3,7 +3,6 @@
 from triton.testing import do_bench
 from triton_kernels.reduce import reduce, reduce_torch, PostprocessFn, FnSpecs
 from triton_kernels.numerics_details.mxfp import upcast_from_mxfp_torch, downcast_to_mxfp_torch
-from triton_kernels.numerics import InFlexData, OutFlexData
 from triton_kernels.target_info import is_cuda, is_hip, is_hip_cdna3, is_hip_cdna4
 import triton
 import triton.language as tl
@@ -70,24 +69,25 @@ def test_op(B, M, N, dtype_str, dim, mask_mode, postprocess_fn):
     torch.manual_seed(0)
     device = "cuda"
     x = torch.randn((B, M, N), device=device, dtype=torch.float32, requires_grad=True)
-    x_mscale, x_flex = None, None
-    y_flex_tri, y_flex_ref = None, None
+    x_scale_mx, x_scale_global = None, None
+    y_scale_global, y_absmax_tri, y_absmax_ref = None, None, None
     if is_mx := dtype_str.startswith("mx"):
         dtype = dtype_str_to_torch(dtype_str.removeprefix("mx"))
-        x, x_mscale = downcast_to_mxfp_torch(x.to(torch.float16), dtype, axis=-1)
+        x, x_scale_mx = downcast_to_mxfp_torch(x.to(torch.float16), dtype, axis=-1)
     if is_flex := dtype_str.startswith("flex"):
         dtype = dtype_str_to_torch(dtype_str.removeprefix("flex"))
         expected_scale = torch.tensor([4], device=device, dtype=torch.float32)
-        x_flex = InFlexData(scale=torch.tensor([2], device=device, dtype=torch.float32))
-        x = x / x_flex.scale
+        x_scale_global = torch.tensor([2], device=device, dtype=torch.float32)
+        x = x / x_scale_global
         x = x.to(dtype)
-        y_flex_tri = OutFlexData(expected_scale=expected_scale, actual_scale=torch.empty_like(expected_scale))
-        y_flex_ref = OutFlexData(expected_scale=expected_scale, actual_scale=torch.empty_like(expected_scale))
+        y_scale_global = expected_scale
+        y_absmax_tri = torch.zeros_like(expected_scale)
+        y_absmax_ref = torch.zeros_like(expected_scale)
     mask = init_mask(mask_mode, B, M, N, device)
     expected_exception = ValueError if dim == 2 and is_mx else None
     if expected_exception is not None:
         with pytest.raises(expected_exception):
-            reduce(x, dim=dim, mask=mask, x_mxscale=x_mscale)
+            reduce(x, dim=dim, mask=mask, x_scale_mx=x_scale_mx)
         return
     if postprocess_fn == "plus_ten":
         postprocess_fn_tri = PostprocessFn(specs=FnSpecs("plus_a", plus_a_reduce, ("a", ), reduction_n=2),
@@ -98,16 +98,32 @@ def test_op(B, M, N, dtype_str, dim, mask_mode, postprocess_fn):
     # run forward pass
     x_tri = x.clone().detach().requires_grad_(True)
     x_ref = x.clone().detach().requires_grad_(True)
-    y_tri, y_tri_mxscale = reduce(x_tri, dim=dim, mask=mask, x_mxscale=x_mscale, x_flex=x_flex, y_flex=y_flex_tri,
-                                  postprocess_fn1=postprocess_fn_tri)
-    y_ref, y_ref_mxscale = reduce_torch(x_ref, dim=dim, mask=mask, x_mxscale=x_mscale, x_flex=x_flex, y_flex=y_flex_ref,
-                                        postprocess_fn1=postprocess_fn_ref)
+    y_tri, y_tri_mxscale = reduce(
+        x_tri,
+        dim=dim,
+        mask=mask,
+        x_scale_mx=x_scale_mx,
+        x_scale_global=x_scale_global,
+        y_scale_global=y_scale_global,
+        y_absmax=y_absmax_tri,
+        postprocess_fn1=postprocess_fn_tri,
+    )
+    y_ref, y_ref_mxscale = reduce_torch(
+        x_ref,
+        dim=dim,
+        mask=mask,
+        x_scale_mx=x_scale_mx,
+        x_scale_global=x_scale_global,
+        y_scale_global=y_scale_global,
+        y_absmax=y_absmax_ref,
+        postprocess_fn1=postprocess_fn_ref,
+    )
     if is_mx:
         y_ref = upcast_from_mxfp_torch(y_ref, y_ref_mxscale, torch.float16, axis=-1)
         y_tri = upcast_from_mxfp_torch(y_tri, y_tri_mxscale, torch.float16, axis=-1)
     assert torch.allclose(y_tri.float(), y_ref.float(), atol=1e-3, rtol=1e-3)
     if is_flex:
-        torch.allclose(y_flex_tri.actual_scale, y_flex_ref.actual_scale, atol=1e-3, rtol=1e-3)
+        assert torch.allclose(y_absmax_tri, y_absmax_ref, atol=1e-3, rtol=1e-3)
     run_bwd = postprocess_fn is None and "float8" not in dtype_str
     if run_bwd:
         dy = torch.randn_like(y_tri)

python/triton_kernels/triton_kernels/matmul.py

Lines changed: 25 additions & 46 deletions
@@ -7,7 +7,6 @@
 from enum import Enum, auto
 import math
 from typing import Callable
-from types import SimpleNamespace
 # utilities
 from triton_kernels import target_info
 from triton_kernels.meta import Closure
@@ -260,36 +259,25 @@ def matmul(a, b, bias,
     if epilogue is None:
         epilogue = Epilogue(FnSpecs.default(), tuple(), tuple(), False)
     n_slices = max(1, b.shape[0]) if a_ragged_metadata is None else a_ragged_metadata.n_slices
-    if c is not None and not isinstance(c, Tensor):
-        c = wrap_torch_tensor(c)
-    if d is not None and not isinstance(d, Tensor):
-        d = wrap_torch_tensor(d)
-    c_data = None if c is None else c.storage.data
-    d_data = None if d is None else d.storage.data
     if not isinstance(a, Tensor):
         a = wrap_torch_tensor(a)
     if not isinstance(b, Tensor):
         b_dtype = FP4 if b.dtype == torch.uint8 else None
         b = wrap_torch_tensor(b, dtype=b_dtype)
-    a_scale_global = a.scale_global
+    if c is not None and not isinstance(c, Tensor):
+        c = wrap_torch_tensor(c)
+    if d is not None and not isinstance(d, Tensor):
+        d = wrap_torch_tensor(d)
+    d_data = None if d is None else d.storage.data
     a_scale = a.scale_mx
-    if isinstance(a_scale, torch.Tensor):
-        a_scale = wrap_torch_tensor(a_scale)
-    b_scale_global = b.scale_global
     b_scale = b.scale_mx
-    if isinstance(b_scale, torch.Tensor):
-        b_scale = wrap_torch_tensor(b_scale)
     b_has_mx = b_scale is not None
     if b_has_mx and (torch.cuda.get_device_capability()[0] < 10 or b.storage.layout is not None and not isinstance(b.storage.layout, StridedLayout)):
         assert b.stride(-2) == 1, "`w` must be column-major when it has data-type mxfp and (swizzled or not on >=Blackwell)"
-        if b_scale is not None:
-            b_scale.storage.data = b_scale.data.view(torch.uint8)
     is_hopper_fp8 = is_cuda() and not target_info.cuda_capability_geq(10, 0) and b.dtype.bitwidth == 8
     if is_hopper_fp8: assert b.stride(-2) == 1, "`w` must be column-major when it has data-type FP8 on capability < 10"
     c_scale_global = None if c is None else c.scale_global
     c_scale_mx = None if c is None else c.scale_mx
-    if isinstance(c_scale_mx, torch.Tensor):
-        c_scale_mx = wrap_torch_tensor(c_scale_mx)
     d_scale_global = None if d is None else d.scale_global

     # unpack a scale
@@ -310,8 +298,8 @@ def matmul(a, b, bias,
         batch_size = b.shape[0]
     else:
         batch_size = 1
-    if d_data is not None:
-        d_is_c = c_data is not None and d_data.data_ptr() == c_data.data_ptr() and d_data.stride() == c_data.stride()
+    if d_data is not None and c is not None:
+        d_is_c = d_data.data_ptr() == c.storage.data.data_ptr() and d_data.stride() == c.storage.data.stride()
     else:
         d_is_c = None
     K = a.shape[-1]
@@ -327,8 +315,8 @@ def matmul(a, b, bias,
         (b_scale is None or is_tma_compliant(b_scale)) and
         (ragged_dimension != "M" or a.stride(-1) == 1) and
         # Currently we don't support tma if y is column major; may revisit later if this becomes an issue.
-        (c_data is None or c_data.stride(-1) == 1) and
-        (d_data is None or d_is_c) and
+        (c is None or c.storage.data.stride(-1) == 1) and
+        (d is None or d_is_c) and
         # if ragged dimension is K, w must be either padded or row major to ensure alignment
         (ragged_dimension != "K" or b.stride(-1) == 1 or b_ragged_metadata.slice_sizes_divisibility is not None)
     )
@@ -382,7 +370,7 @@ def matmul(a, b, bias,
         gather_indx, scatter_indx, batch_size,
         fused_comm.n_reduce_shards if fused_comm is not None else 1,
         opt_flags)
-    memory = apply_allocation(allocation, c_data)
+    memory = apply_allocation(allocation, None if c is None else c.storage.data)
     # early exit
     if batch_size * M * N == 0:
         ret = memory["output"].squeeze(0)
@@ -420,10 +408,10 @@ def matmul(a, b, bias,
     # canonicalize storage
    has_scatter_tma = scatter_indx is not None and target_info.has_tma_gather()
     c = wrap_torch_tensor(out_matmul.view(math.prod(out_matmul.shape[:-1]), out_matmul.shape[-1]) if has_scatter else out_matmul.view(math.prod(out_matmul.shape[:-2]), *out_matmul.shape[-2:]))
-    a = Tensor(_canonicalize_storage(a.storage, 2 if has_gather_tma else 3), dtype=a.dtype, shape=a.shape, shape_max=a.shape_max)
-    b = Tensor(_canonicalize_storage(b.storage, 3), dtype=b.dtype, shape=b.shape, shape_max=b.shape_max)
-    c = Tensor(_canonicalize_storage(c.storage, 2 if has_scatter_tma else 3), dtype=c.dtype, shape=c.shape, shape_max=c.shape_max)
-    # create tma descriptor for x
+    a = Tensor(_canonicalize_storage(a.storage, 2 if has_gather_tma else 3), dtype=a.dtype, shape=a.shape, shape_max=a.shape_max, scale_global=a.scale_global, scale_mx=a.scale_mx)
+    b = Tensor(_canonicalize_storage(b.storage, 3), dtype=b.dtype, shape=b.shape, shape_max=b.shape_max, scale_global=b.scale_global, scale_mx=b.scale_mx)
+    c = Tensor(_canonicalize_storage(c.storage, 2 if has_scatter_tma else 3), dtype=c.dtype, shape=c.shape, shape_max=c.shape_max, scale_global=c.scale_global, scale_mx=c.scale_mx)
+    # create tma descriptor for d
    if d_data is not None:
         assert opt_flags.split_k == 1, "d + split_k is not supported."
         assert scatter_indx is None, "d + scatter is not supported."
@@ -511,10 +499,10 @@ def matmul(a, b, bias,
         *((None, out_matmul_scale, None) if out_matmul_has_mx else (out_global_scale, out_absmax, None)),
         *out_matmul_scale_strides[-4:],
         a_tensor_or_tma, a.storage.data, *a_strides, a_transpose,
-        a_scale_global,
+        a.scale_global,
         a_scale_tensor_or_tma, *a_scale_strides,
         b_tensor_or_tma, b.storage.data, *b.storage.data.stride(), b_transpose,
-        b_scale_global,
+        b.scale_global,
         b_scale_tensor_or_tma, *b_scale_strides,
         d_data, *d_strides,
         d_scale_global, d_is_c,
@@ -536,7 +524,7 @@ def matmul(a, b, bias,
         precision_config.max_num_imprecise_acc,
         precision_config.allow_tf32,
         precision_config.flexpoint_saturate_inf,
-        _is_per_batch_scale(b_scale_global),
+        _is_per_batch_scale(b.scale_global),
         _is_per_batch_scale(out_global_scale),
         _is_per_batch_scale(d_scale_global),
         opt_flags.block_m,
@@ -569,33 +557,24 @@ def matmul(a, b, bias,
         assert not out_matmul_has_mx
         postprocess_fn1 = ReducePostprocessFn(specs=reduce_fused_activation.specs, fn_args=reduce_fused_activation.fn_args)
         postprocess_fn2 = ReducePostprocessFn(specs=epilogue.specs, fn_args=epilogue.fn_arg_values_finalize)
-        reduce_y_flex = None
-        if c_scale_global is not None or c_absmax is not None:
-            reduce_y_flex = SimpleNamespace(
-                expected_scale=c_scale_global,
-                actual_scale=c_absmax,
-                checksum_scale=None,
-                is_per_batch=_is_per_batch_scale(c_scale_global),
-                reinterpret=lambda x: x,
-            )
-        c, y_mx_scale = reduce(
+        c, c_mx_scale = reduce(
            x = out_matmul.view(out_matmul.shape[0], -1, out_matmul.shape[-1]),
            dim = 0,
            # output data/metadata
            y = memory["output"].view(-1, memory["output"].shape[-1]),
            y_dtype = memory["output"].dtype,
-           x_flex = None,
-           y_flex = reduce_y_flex,
-           y_flex_saturate_inf = precision_config.flexpoint_saturate_inf,
+           y_scale_global = c_scale_global,
+           y_absmax = c_absmax,
+           y_saturate_inf = precision_config.flexpoint_saturate_inf,
            y_has_mx = c_scale_mx is not None,
            # fused functions
            postprocess_fn1 = postprocess_fn1,
            postprocess_fn2 = postprocess_fn2,
        )
         y_shape = out_matmul.shape[1:-1] + (out_matmul.shape[-1] // reduce_fused_activation.specs.reduction_n,)
         out_final = c.view(*y_shape)
-        if y_mx_scale is not None:
-            out_final_mx_scale = y_mx_scale.view(out_matmul.shape[-2], triton.cdiv(out_matmul.shape[-1], 32))
+        if c_mx_scale is not None:
+            out_final_mx_scale = c_mx_scale.view(out_matmul.shape[-2], triton.cdiv(out_matmul.shape[-1], 32))
     else:
         out_final = out_matmul.squeeze(0)
         out_final_mx_scale = out_matmul_scale
@@ -627,7 +606,7 @@ def apply(x, scale):
         return x.float() * scale

     if x_tri.scale_mx is not None:
-        a_scale = x_tri.scale_mx if isinstance(x_tri.scale_mx, Tensor) else wrap_torch_tensor(x_tri.scale_mx)
+        a_scale = x_tri.scale_mx
         mx_axis = x_tri.storage.data.ndim -1
         canonical_layout = layout.StridedLayout(major_dim=mx_axis)
         x_tri = convert_layout(x_tri, canonical_layout)
@@ -637,7 +616,7 @@ def apply(x, scale):
     x_ref = apply(x_tri.storage.data, x_tri.scale_global)

     if w_tri.scale_mx is not None:
-        b_scale = w_tri.scale_mx if isinstance(w_tri.scale_mx, Tensor) else wrap_torch_tensor(w_tri.scale_mx)
+        b_scale = w_tri.scale_mx
         mx_axis = w_tri.storage.data.ndim - 2
         canonical_layout = layout.StridedLayout(major_dim=mx_axis)
         w_tri = convert_layout(w_tri, canonical_layout)

Lines changed: 0 additions & 44 deletions
@@ -1,51 +1,7 @@
-import torch
-from dataclasses import dataclass
-
 # ------ global scaling -------

 MAX_FINITE_FLOAT8E5 = 57344.0
 MAX_FINITE_FLOAT8E4NV = 448.0
 MAX_FINITE_FLOAT8E4B8 = 240.0

-
-@dataclass(frozen=True)
-class BaseFlexData:
-    dtype: torch.dtype | None = None
-
-    def view(self, x: torch.Tensor):
-        if self.dtype is None:
-            return x
-        return x.view(self.dtype)
-
-    def reinterpret(self, x):
-        if self.dtype is None or x.dtype.itemsize > 1:
-            return x
-        return x.view(self.dtype)
-
-
-@dataclass(frozen=True)
-class InFlexData(BaseFlexData):
-    scale: torch.Tensor | None = None
-
-    @property
-    def is_per_batch(self):
-        return False if self.scale is None else len(self.scale) > 1
-
-
-@dataclass(frozen=True)
-class OutFlexData(BaseFlexData):
-    expected_scale: torch.Tensor | None = None
-    actual_scale: torch.Tensor | None = None
-    checksum_scale: torch.Tensor | None = None
-
-    @property
-    def is_per_batch(self):
-        return False if self.expected_scale is None else len(self.expected_scale) > 1
-
-    def __iter__(self):
-        yield self.expected_scale
-        yield self.actual_scale
-        yield self.checksum_scale
-
-
 # ------ block scaling -------
