
Commit af2196d

1 parent 4027076 commit af2196d

5 files changed

Lines changed: 59 additions & 51 deletions


python/triton_kernels/bench/bench_mlp.py

Lines changed: 5 additions & 6 deletions
@@ -15,7 +15,7 @@
 from triton_kernels.tensor import make_ragged_tensor_metadata, remap_ragged_tensor_metadata # ragged tensor
 from triton_kernels.distributed import convert_dp_to_ep, convert_ep_to_dp, make_expt_dict_uniform, make_expt_assignment, SymmetricMemoryPool
 # quantization
-from triton_kernels.tensor import convert_layout, wrap_torch_tensor, FP4
+from triton_kernels.tensor import convert_layout, wrap_torch_tensor, FP4, Tensor
 from triton_kernels.numerics_details.mxfp import downcast_to_mxfp


@@ -39,17 +39,16 @@ def quantize_weight(w, dtype, **opt):
         fp8e4_dtype = torch.float8_e4m3fn if get_cdna_version() != 3 else torch.float8_e4m3fnuz
         wq = w.to(fp8e4_dtype)
         wq = wq.transpose(-1, -2).contiguous().transpose(-1, -2)
-        wq = wrap_torch_tensor(wq)
-        wq.scale_global = w.abs().max().unsqueeze(0)
-        return wq
+        return wrap_torch_tensor(wq, scale_global=w.abs().max().unsqueeze(0))
     else:
         assert dtype == FP4, f"{dtype=}"
         w, w_scale = downcast_to_mxfp(w.to(torch.bfloat16), torch.uint8, axis=1)
         if opt:
             w = convert_layout(wrap_torch_tensor(w, dtype=FP4), opt["value_layout"], **opt["value_layout_opts"])
             w_scale = convert_layout(wrap_torch_tensor(w_scale), opt["scale_layout"], **opt["scale_layout_opts"])
-        w.scale_mx = w_scale
-        return w
+        if isinstance(w, Tensor):
+            return Tensor(w.storage, dtype=w.dtype, shape=w.shape, shape_max=w.shape_max, scale_mx=w_scale)
+        return wrap_torch_tensor(w, dtype=FP4, scale_mx=w_scale)


 def run_mlp(x_dp_local_bf16, x_dp_local_fp8, # activations
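
The commit replaces post-construction mutation of scale fields with construction-time arguments: scales now travel with the `Tensor` from the moment it is wrapped. A minimal sketch of the new FP8 path in `quantize_weight` above (and the identical change in `bench_utils.py` below), assuming `triton_kernels` is installed and a CUDA device is available; the weight shape is illustrative:

    import torch
    from triton_kernels.tensor import wrap_torch_tensor

    w = torch.randn(4096, 4096, device="cuda", dtype=torch.bfloat16)
    wq = w.to(torch.float8_e4m3fn)
    # keep the column-major layout the kernels expect
    wq = wq.transpose(-1, -2).contiguous().transpose(-1, -2)
    # scale_global is passed to the constructor instead of assigned afterwards
    wq = wrap_torch_tensor(wq, scale_global=w.abs().max().unsqueeze(0))

In the mx4 branch, `convert_layout` already returns a `Tensor`, so the code rebuilds a new `Tensor` around the same `storage` to attach `scale_mx`, rather than mutating the object in place.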

python/triton_kernels/bench/bench_utils.py

Lines changed: 4 additions & 5 deletions
@@ -20,17 +20,16 @@ def _quantize_weight(w, dtype, **opt):
         wq = w.to(fp8e4_dtype)
         if is_cuda() and not cuda_capability_geq(10, 0):
             wq = wq.transpose(-1, -2).contiguous().transpose(-1, -2)
-        wq = wrap_torch_tensor(wq)
-        wq.scale_global = w.abs().max().unsqueeze(0)
-        return wq
+        return wrap_torch_tensor(wq, scale_global=w.abs().max().unsqueeze(0))
     else:
         assert dtype == "mx4", f"{dtype=}"
         w, w_scale = downcast_to_mxfp(w.to(torch.bfloat16), torch.uint8, axis=1)
         if opt:
             w = convert_layout(wrap_torch_tensor(w, dtype=FP4), opt["value_layout"], **opt["value_layout_opts"])
             w_scale = convert_layout(wrap_torch_tensor(w_scale), opt["scale_layout"], **opt["scale_layout_opts"])
-        w.scale_mx = w_scale
-        return w
+        if isinstance(w, Tensor):
+            return Tensor(w.storage, dtype=w.dtype, shape=w.shape, shape_max=w.shape_max, scale_mx=w_scale)
+        return wrap_torch_tensor(w, dtype=FP4, scale_mx=w_scale)


 @dataclass

python/triton_kernels/tests/test_matmul.py

Lines changed: 14 additions & 15 deletions
@@ -375,7 +375,6 @@ def _test_op(m, n, k, split_k, do_gather, do_scatter, inner_expt_opt, do_gamma,
     c = torch.empty(c_shape, dtype=c_dtype.torch_dtype, device=device)
     if c_transpose:
         c = c.mT.contiguous().mT
-    c = wrap_torch_tensor(c)

     # --- create precision config ---
     wrap_list = lambda vals: torch.tensor(vals, dtype=torch.float32, device=device)
@@ -387,48 +386,48 @@
         acc_scale=2.0 if c_dtype.has_global_scale or b_dtype.has_global_scale else 1.0,
         out_dtype=c_dtype.torch_dtype,
     )
-    a_scale_mx = a_scales
-    b_scale_mx = b_scale_tri
     c_scale_mx = None
-    a.scale_global = a_scale_global
-    b.scale_global = b_scale_global
-    c.scale_global = c_scale_global
-    c.scale_actual = c_absmax
-    a.scale_mx = a_scale_mx
-    b.scale_mx = b_scale_mx
-    c.scale_mx = c_scale_mx

     # --- create epilogue ---
     epilogue = None
     if c_dtype.has_mx_scale:
         c_scale_shape = c_shape[:-1] + (triton.cdiv(c_shape[-1], MXFP_BLOCK_SIZE),)
         c_scale_mx = torch.empty(c_scale_shape, dtype=torch.uint8, device=a.device)
-        c.scale_mx = c_scale_mx
         epilogue_spec = FnSpecs(FnName.QUANTIZE_MXFP8.name, quantize_mxfp8_fn, (), ())
         epilogue = Epilogue(epilogue_spec, tuple(), tuple(), effective_itemsize=6.0)

+    if isinstance(a, Tensor):
+        a = Tensor(a.storage, dtype=a.dtype, shape=a.shape, shape_max=a.shape_max, scale_global=a_scale_global, scale_mx=a_scales)
+    else:
+        a = wrap_torch_tensor(a, scale_global=a_scale_global, scale_mx=a_scales)
+    if isinstance(b, Tensor):
+        b = Tensor(b.storage, dtype=b.dtype, shape=b.shape, shape_max=b.shape_max, scale_global=b_scale_global, scale_mx=b_scale_tri)
+    else:
+        b = wrap_torch_tensor(b, scale_global=b_scale_global, scale_mx=b_scale_tri)
+    c = wrap_torch_tensor(c, scale_global=c_scale_global, scale_mx=c_scale_mx)

     # --- triton implementation ---
     try:
         tri_y = matmul(a, b, bias,
                        a_ragged_metadata, b_ragged_metadata,
                        gather_indx, scatter_indx, precision_opt,
-                       gammas=gammas, epilogue=epilogue, c=c,
+                       gammas=gammas, epilogue=epilogue, c=c, c_absmax=c_absmax,
                        fused_activation=fused_activation)
         if c_dtype.has_global_scale:
-            tri_y_scale = c.scale_actual.clone()
+            tri_y_scale = c_absmax.clone()
     except (opt_flags.InapplicableConstraint, NotImplementedError) as e:
         pytest.skip(f"inapplicable opt_flags constraint {e}")
     # --- torch implementation ---
     ref_y = matmul_torch(a, b, bias, #
                          a_ragged_metadata, b_ragged_metadata,
                          gather_indx, scatter_indx, precision_opt,
                          gammas=gammas,
-                         c=c)
+                         c=c,
+                         c_absmax=c_absmax)
     if swiglu_opts is not None:
         ref_y = swiglu(ref_y, alpha=swiglu_opts[0], precision_config=SwiGLUPrecisionConfig(swiglu_opts[1]))
     if c_dtype.has_global_scale:
-        ref_y_scale = c.scale_actual.clone()
+        ref_y_scale = c_absmax.clone()

     # --- check results ---
     if c_dtype.has_mx_scale:
python/triton_kernels/triton_kernels/matmul.py

Lines changed: 29 additions & 20 deletions
@@ -224,6 +224,7 @@ def matmul(a, b, bias,
            gammas: torch.Tensor | None = None,
            out_alpha: float | None = None,
            c: torch.Tensor | Tensor | None = None,
+           c_absmax: torch.Tensor | None = None,
            fused_comm: FusedComm | None = None,
            fused_activation: FusedActivation | None = None,
            epilogue: Epilogue | None = None,
@@ -259,32 +260,37 @@ def matmul(a, b, bias,
     if epilogue is None:
         epilogue = Epilogue(FnSpecs.default(), tuple(), tuple(), False)
     n_slices = max(1, b.shape[0]) if a_ragged_metadata is None else a_ragged_metadata.n_slices
-    c_data = c.storage.data if isinstance(c, Tensor) else c
-    d_data = d.storage.data if isinstance(d, Tensor) else d
+    if c is not None and not isinstance(c, Tensor):
+        c = wrap_torch_tensor(c)
+    if d is not None and not isinstance(d, Tensor):
+        d = wrap_torch_tensor(d)
+    c_data = None if c is None else c.storage.data
+    d_data = None if d is None else d.storage.data
     if not isinstance(a, Tensor):
         a = wrap_torch_tensor(a)
     if not isinstance(b, Tensor):
-        dtype = FP4 if b.dtype == torch.uint8 else None
-        b = wrap_torch_tensor(b, dtype=dtype)
+        b_dtype = FP4 if b.dtype == torch.uint8 else None
+        b = wrap_torch_tensor(b, dtype=b_dtype)
     a_scale_global = a.scale_global
     a_scale = a.scale_mx
-    if a_scale is not None and not isinstance(a_scale, Tensor):
+    if isinstance(a_scale, torch.Tensor):
         a_scale = wrap_torch_tensor(a_scale)
     b_scale_global = b.scale_global
     b_scale = b.scale_mx
+    if isinstance(b_scale, torch.Tensor):
+        b_scale = wrap_torch_tensor(b_scale)
     b_has_mx = b_scale is not None
     if b_has_mx and (torch.cuda.get_device_capability()[0] < 10 or b.storage.layout is not None and not isinstance(b.storage.layout, StridedLayout)):
         assert b.stride(-2) == 1, "`w` must be column-major when it has data-type mxfp and (swizzled or not on >=Blackwell)"
-    if b_scale is not None and not isinstance(b_scale, Tensor):
-        b_scale = wrap_torch_tensor(b_scale)
     if b_scale is not None:
         b_scale.storage.data = b_scale.data.view(torch.uint8)
     is_hopper_fp8 = is_cuda() and not target_info.cuda_capability_geq(10, 0) and b.dtype.bitwidth == 8
     if is_hopper_fp8: assert b.stride(-2) == 1, "`w` must be column-major when it has data-type FP8 on capability < 10"
-    c_scale_global = None if not isinstance(c, Tensor) else c.scale_global
-    c_absmax = None if not isinstance(c, Tensor) else c.scale_actual
-    c_scale_mx = None if not isinstance(c, Tensor) else c.scale_mx
-    d_scale_global = None if not isinstance(d, Tensor) else d.scale_global
+    c_scale_global = None if c is None else c.scale_global
+    c_scale_mx = None if c is None else c.scale_mx
+    if isinstance(c_scale_mx, torch.Tensor):
+        c_scale_mx = wrap_torch_tensor(c_scale_mx)
+    d_scale_global = None if d is None else d.scale_global

     # unpack a scale
     a_has_mx = a_scale is not None
@@ -597,7 +603,7 @@ def matmul(a, b, bias,
     if not (is_input_batched or b_ragged_metadata is not None):
         out_final = out_final.squeeze(0)
     if out_final_mx_scale is not None and c_scale_mx is not None:
-        c_scale_mx_torch = c_scale_mx.storage.data if isinstance(c_scale_mx, Tensor) else c_scale_mx
+        c_scale_mx_torch = c_scale_mx.storage.data
         if out_final_mx_scale.data_ptr() != c_scale_mx_torch.data_ptr():
             c_scale_mx_torch.copy_(out_final_mx_scale)
     return out_final
@@ -675,14 +681,17 @@ def matmul_torch(a, b, bias,
                  gammas = None,
                  round_x = None, round_y = None,
                  c: torch.Tensor | Tensor | None = None,
+                 c_absmax: torch.Tensor | None = None,
                  ):
     if precision_config is None:
         precision_config = PrecisionConfig()
+    if c is not None and not isinstance(c, Tensor):
+        c = wrap_torch_tensor(c)
     if not isinstance(a, Tensor):
         a = wrap_torch_tensor(a)
     if not isinstance(b, Tensor):
-        dtype = FP4 if b.dtype == torch.uint8 else None
-        b = wrap_torch_tensor(b, dtype=dtype)
+        b_dtype = FP4 if b.dtype == torch.uint8 else None
+        b = wrap_torch_tensor(b, dtype=b_dtype)
     a, b = apply_precision(a, b, precision_config)

     if b_ragged_metadata is not None:
@@ -708,9 +717,9 @@ def matmul_torch(a, b, bias,
                 round_y=round_y,
             )
             out[expt] = out_expt.to(out.dtype)
-        if isinstance(c, Tensor) and c.scale_actual is not None:
-            c.scale_actual.copy_(compute_actual_scale(out, precision_config.out_dtype))
-        return scale(out, c.scale_global if isinstance(c, Tensor) else None)
+        if c_absmax is not None:
+            c_absmax.copy_(compute_actual_scale(out, precision_config.out_dtype))
+        return scale(out, None if c is None else c.scale_global)

     is_input_batched = a.ndim == 3
     assert a.dtype.itemsize > 1
@@ -760,9 +769,9 @@ def matmul_torch(a, b, bias,
         out = torch.zeros((scatter_indx.shape[0], y.shape[-1]), dtype=y.dtype, device=a.device)
         msk = scatter_indx != -1
         out[scatter_indx[msk], :] = y[msk, :]
-        if isinstance(c, Tensor) and c.scale_actual is not None:
-            c.scale_actual.copy_(compute_actual_scale(out, precision_config.out_dtype))
-        return scale(out, c.scale_global if isinstance(c, Tensor) else None)
+        if c_absmax is not None:
+            c_absmax.copy_(compute_actual_scale(out, precision_config.out_dtype))
+        return scale(out, None if c is None else c.scale_global)


 def post_matmul_comm_torch(y: torch.Tensor, rank: int, n_reduce_shards: int,
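
Two structural changes stand out in matmul above: raw torch.Tensor arguments for `c` and `d` are coerced to `Tensor` as soon as they enter the function, and the per-output absmax moves from the removed `Tensor.scale_actual` field to the explicit `c_absmax` argument. A condensed, standalone restatement of the coercion pattern (`normalize` is a hypothetical name; the real code inlines this logic):

    # hypothetical helper illustrating the normalize-at-the-boundary pattern
    def normalize(x):
        if x is not None and not isinstance(x, Tensor):
            x = wrap_torch_tensor(x)  # raw torch.Tensor -> Tensor
        return x

    c = normalize(c)
    d = normalize(d)
    # after normalization, every later access drops its isinstance checks:
    c_data = None if c is None else c.storage.data
    c_scale_global = None if c is None else c.scale_global
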

python/triton_kernels/triton_kernels/tensor.py

Lines changed: 7 additions & 5 deletions
@@ -36,7 +36,6 @@ class Tensor:
     shape: list[int] | None = None
     shape_max: list[int] | None = None
     scale_global: torch.Tensor | None = None
-    scale_actual: torch.Tensor | None = None
     scale_mx: object | None = None

     def __post_init__(self):
@@ -209,7 +208,11 @@
 # ---------------------------------------------------------------------------- #


-def wrap_torch_tensor(torch_tensor, dtype=None, shape=None, shape_max=None, layout=None):
+def wrap_torch_tensor(torch_tensor, dtype=None, shape=None, shape_max=None, layout=None, scale_global=None,
+                      scale_mx=None):
+    assert isinstance(torch_tensor, torch.Tensor), f"`wrap_torch_tensor` expects torch.Tensor, got {type(torch_tensor)}"
+    if isinstance(scale_mx, torch.Tensor):
+        scale_mx = wrap_torch_tensor(scale_mx)
     if dtype is None:
         dtype = torch_tensor.dtype
     dtype = torch_dtype_to_dtype(dtype)
@@ -229,9 +232,8 @@ def wrap_torch_tensor(torch_tensor, dtype=None, shape=None, shape_max=None, layo
         dtype=dtype,
         shape=shape,
         shape_max=shape_max,
-        scale_global=None,
-        scale_actual=None,
-        scale_mx=None,
+        scale_global=scale_global,
+        scale_mx=scale_mx,
     )
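
With the extended `wrap_torch_tensor` signature, scales are supplied at wrap time, and a raw torch.Tensor passed as `scale_mx` is itself wrapped recursively into a `Tensor`. A short usage sketch mirroring the benchmark code above; shapes and values are illustrative only:

    import torch
    from triton_kernels.tensor import wrap_torch_tensor, FP4, Tensor

    # packed fp4 payload (two values per byte) plus per-block mx scales
    w_packed = torch.randint(0, 255, (4096, 2048), dtype=torch.uint8, device="cuda")
    w_scale = torch.randint(0, 255, (4096, 128), dtype=torch.uint8, device="cuda")
    w = wrap_torch_tensor(w_packed, dtype=FP4, scale_mx=w_scale)
    assert isinstance(w.scale_mx, Tensor)  # the raw scale tensor was auto-wrapped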