triton-lang
diff --git a/python/triton_kernels/tests/test_matmul.py b/python/triton_kernels/tests/test_matmul.py
Lines changed: 11 additions & 13 deletions
diff --git a/python/triton_kernels/tests/test_matmul_details/test_opt_flags_split_k.py b/python/triton_kernels/tests/test_matmul_details/test_opt_flags_split_k.py
Lines changed: 4 additions & 4 deletions
diff --git a/python/triton_kernels/tests/test_mxfp.py b/python/triton_kernels/tests/test_mxfp.py
Lines changed: 11 additions & 6 deletions
diff --git a/python/triton_kernels/tests/test_tensor.py b/python/triton_kernels/tests/test_tensor.py
Lines changed: 3 additions & 2 deletions
diff --git a/python/triton_kernels/tests/test_tensor_details/test_layout_blackwell.py b/python/triton_kernels/tests/test_tensor_details/test_layout_blackwell.py
Lines changed: 9 additions & 6 deletions
diff --git a/python/triton_kernels/tests/test_tensor_details/test_layout_cdna4.py b/python/triton_kernels/tests/test_tensor_details/test_layout_cdna4.py
Lines changed: 3 additions & 2 deletions
diff --git a/python/triton_kernels/tests/test_tensor_details/test_layout_hopper.py b/python/triton_kernels/tests/test_tensor_details/test_layout_hopper.py
Lines changed: 13 additions & 9 deletions
diff --git a/python/triton_kernels/triton_kernels/compaction.py b/python/triton_kernels/triton_kernels/compaction.py
Lines changed: 2 additions & 2 deletions
@@ -21,6 +21,8 @@
 from triton_kernels.swiglu import swiglu, swiglu_fn
 from triton_kernels.swiglu import PrecisionConfig as SwiGLUPrecisionConfig
 from triton_kernels.tensor_details import layout
+from triton_kernels.tensor_details.dtype import FP32
+
 # ---------------
 # numerics stuff
 # ---------------
@@ -134,9 +136,9 @@ def _build_test_op_cases():
         Case(1024, 1024, 1024, "batched", "float8_e5m2", "mxfloat4_e2m1"),
         Case(1024, 1024, 1024, "ragged", "float8_e5m2", "mxfloat4_e2m1", split_k=9),
         Case(1024, 1024, 1024, "ragged", "float8_e5m2", "mxfloat4_e2m1", split_k=9, b_hbm_swizzling=True),
-        Case(300, 400, 400, "ragged", "float8_e5m2", "mxfloat8_e4m3fn"),
+        Case(300, 400, 416, "ragged", "float8_e5m2", "mxfloat8_e4m3fn"),
         Case(300, 400, 832, "ragged", "float8_e5m2", "mxfloat4_e2m1"),
-        Case(300, 400, 400, "batched", "float8_e5m2", "mxfloat8_e4m3fn"),
+        Case(300, 400, 416, "batched", "float8_e5m2", "mxfloat8_e4m3fn"),
     ])
     # mxfloat x mxfloat
     test_cases.extend([
@@ -145,11 +147,11 @@ def _build_test_op_cases():
         Case(1024, 1024, 1024, "ragged", "mxfloat8_e4m3fn", "mxfloat4_e2m1", split_k=9, colmajor_mxfp_weight=False),
         Case(1000, 704, 800, "batched", "mxfloat8_e4m3fn", "mxfloat4_e2m1", b_hbm_swizzling=True, a_hbm_swizzling=True),
         Case(1000, 704, 800, "ragged", "mxfloat8_e4m3fn", "mxfloat4_e2m1", b_hbm_swizzling=True, a_hbm_swizzling=True),
-        Case(300, 400, 400, "ragged", "mxfloat8_e4m3fn", "mxfloat4_e2m1", b_hbm_swizzling=True, a_hbm_swizzling=True),
+        Case(300, 400, 416, "ragged", "mxfloat8_e4m3fn", "mxfloat4_e2m1", b_hbm_swizzling=True, a_hbm_swizzling=True),
         Case(256, 1024, 512, "ragged", "mxfloat8_e4m3fn", "mxfloat4_e2m1", b_hbm_swizzling=True, a_hbm_swizzling=True),
-        Case(300, 400, 400, "ragged", "mxfloat8_e4m3fn", "mxfloat8_e4m3fn"),
-        Case(300, 400, 400, "ragged", "mxfloat8_e4m3fn", "mxfloat8_e4m3fn", b_hbm_swizzling=True),
-        Case(300, 400, 400, "batched", "mxfloat8_e4m3fn", "mxfloat8_e4m3fn"),
+        Case(300, 400, 416, "ragged", "mxfloat8_e4m3fn", "mxfloat8_e4m3fn"),
+        Case(300, 400, 416, "ragged", "mxfloat8_e4m3fn", "mxfloat8_e4m3fn", b_hbm_swizzling=True),
+        Case(300, 400, 416, "batched", "mxfloat8_e4m3fn", "mxfloat8_e4m3fn"),
         Case(1024, 1024, 1024, "batched", "mxfloat8_e4m3fn", "mxfloat4_e2m1", b_hbm_swizzling=True),
     ])
     # amd-specific float8
@@ -340,9 +342,7 @@ def _test_op(m, n, k, split_k, do_gather, do_scatter, inner_expt_opt, do_gamma,
         ragged_padding = inner_expt_opt is not None and "pad_a" in inner_expt_opt,
         squeeze_batch_dim = mode == "plain",
         scale_hbm_swizzling = layout.make_default_matmul_mxfp8_act_scale_layout if a_hbm_swizzling else None,
-        scale_hbm_swizzling_args = {"ragged_metadata": None}, # ragged_metadata will be set in the make_random_tensor function
     )
-
     b, b_scale_tri, b_ragged_metadata = make_random_tensor(
         shape=(k, n),
         n_slices = n_slices,
@@ -354,10 +354,8 @@ def _test_op(m, n, k, split_k, do_gather, do_scatter, inner_expt_opt, do_gamma,
         ragged_padding = inner_expt_opt is not None and "pad_b" in inner_expt_opt,
         squeeze_batch_dim = mode == "plain",
         is_mx_rowmajor = not colmajor_mxfp_weight,
-        value_hbm_swizzling = layout.make_default_matmul_mxfp4_w_layout if b_hbm_swizzling and colmajor_mxfp_weight and b_dtype.is_mxfloat4 else None,
-        value_hbm_swizzling_args = {"mx_axis":-2},
-        scale_hbm_swizzling = layout.make_default_matmul_mxfp4_w_scale_layout if b_hbm_swizzling and colmajor_mxfp_weight and b_dtype.is_mxfloat4 else None,
-        scale_hbm_swizzling_args = dict(mx_axis=-2, num_warps=num_warps),
+        value_hbm_swizzling = layout.make_default_matmul_mxfp4_w_layout(mx_axis=-2) if b_hbm_swizzling and colmajor_mxfp_weight and b_dtype.is_mxfloat4 else None,
+        scale_hbm_swizzling = layout.make_default_matmul_mxfp4_w_scale_layout(mx_axis=-2, num_warps=num_warps) if b_hbm_swizzling and colmajor_mxfp_weight and b_dtype.is_mxfloat4 else None,
     )
     gather_indx  = None if not do_gather  else torch.randint(0, max(m, 1), (m, ), dtype=torch.int32, device=device)
     scatter_indx = None if not do_scatter else torch.randperm(m, dtype=torch.int32, device=device)
@@ -442,6 +440,6 @@ def test_set_idle_sms():
     from triton_kernels.matmul_details.opt_flags import make_opt_flags
     num_idle_sms = 24
     matmul_set_idle_sms(num_idle_sms)
-    flags = make_opt_flags(torch.float32, torch.float32, torch.float32, PrecisionConfig(), \
+    flags = make_opt_flags(FP32, FP32, FP32, PrecisionConfig(), \
                            1, 1024, 1024, 1024, None, True, False, 1, False, False, None)
     assert flags.idle_sms == num_idle_sms
@@ -6,7 +6,7 @@
 import torch
 
 import triton_kernels.matmul_details.opt_flags as opt_flags
-
+from triton_kernels.tensor_details.dtype import FP16
 
 class _DummyPrecisionConfig:
     def __init__(self):
@@ -84,9 +84,9 @@ def test_make_default_opt_flags_amd_split_k_constraint(monkeypatch):
 
     precision_config = _DummyPrecisionConfig()
     flags = opt_flags.make_default_opt_flags_amd(
-        torch.float16,
-        torch.float16,
-        torch.float16,
+        FP16,
+        FP16,
+        FP16,
         precision_config,
         2,
         128,
 
@@ -29,9 +29,10 @@ def test_mxfp4_rounding_cases(dst_dtype, device):
         torch.float16: 0.250244140625,
         torch.float32: 0.2500000298023223877,
     }[dst_dtype]
+    pad_values = [0] * 22
     # Construct an example where scale is 1 (when max value is 6.0, the maximum value of e2m1)
-    x = torch.tensor([6, 0, 0.24, 0.25, 0.75, 0.99, 1.2, 1.3, -1.25, two_point_five_plus_ulp], dtype=dst_dtype,
-                     device=device).view(1, -1, 1)
+    x = torch.tensor([6, 0, 0.24, 0.25, 0.75, 0.99, 1.2, 1.3, -1.25, two_point_five_plus_ulp] + pad_values,
+                     dtype=dst_dtype, device=device).view(1, -1, 1)
     quant, scale = downcast_to_mxfp(x, torch.uint8, axis=1)
     dequant = upcast_from_mxfp(quant, scale, dst_dtype, axis=1)
     # Tie-breaking cases (RTNE):
@@ -42,7 +43,7 @@ def test_mxfp4_rounding_cases(dst_dtype, device):
     # - -1.25 is halfway between -1.0 and -1.5. RTNE selects -1.0 (even). Away-from-zero would pick -1.5;
     #   towards-zero would pick -1.0.
     # - two_point_five_plus_ulp is slightly bigger than 0.25, so it rounds to 0.5.
-    assert dequant.flatten().tolist() == [6, 0, 0, 0.0, 1.0, 1.0, 1.0, 1.5, -1.0, 0.5], f"{dequant=}"
+    assert dequant.flatten().tolist() == [6, 0, 0, 0.0, 1.0, 1.0, 1.0, 1.5, -1.0, 0.5] + pad_values, f"{dequant=}"
 
     quant_torch, scale_torch = downcast_to_mxfp_torch(x, torch.uint8, axis=1)
     assert_equal(quant_torch, quant)
@@ -56,7 +57,9 @@ def test_mxfp4_rounding_cases(dst_dtype, device):
     # 2**floor(log2(33/(e2m1 max power of 2 = 4)) = 2**3 = 8 (exponent 127+3),
     # and the other values are multiples of representable FP4 values times 8
     # that allow exact reconstruction.
-    x = torch.tensor([33.0, 24.0, 16.0, 8.0, 4.0, 0.0, -32.0, 0.0], device=device).bfloat16().view(1, -1, 1)
+    pad_values = [0] * 24
+    x = torch.tensor([33.0, 24.0, 16.0, 8.0, 4.0, 0.0, -32.0, 0.0] + pad_values,
+                     device=device).bfloat16().view(1, -1, 1)
     quant, scale = downcast_to_mxfp(
         x,
         torch.uint8,
@@ -88,7 +91,8 @@ def test_mxfp_extreme_values(src_dtype, dst_dtype, device):
     src_dtype = dtype_str_to_torch(src_dtype)
     dst_dtype = dtype_str_to_torch(dst_dtype)
     BIG_VALUE = 65470 if dst_dtype == torch.float16 else 3.3895e38
-    x = torch.tensor([BIG_VALUE, BIG_VALUE], dtype=dst_dtype, device=device)
+    pad_values = [0] * 30
+    x = torch.tensor([BIG_VALUE, BIG_VALUE] + pad_values, dtype=dst_dtype, device=device)
     xq_value, xq_scale = downcast_to_mxfp(x, src_dtype, axis=-1)
     xdq = upcast_from_mxfp(xq_value, xq_scale, dst_dtype, axis=-1)
     xdq_ref = upcast_from_mxfp_torch(xq_value, xq_scale, dst_dtype, axis=-1)
@@ -127,6 +131,7 @@ def test_mxfp_quant_dequant(src_dtype, dst_dtype, device):
     weight = weight.repeat((9, 32))  # Repeat the dimensions to test multi block launches.
     weight = weight.reshape([1, weight.shape[0], weight.shape[1]])
     weight = weight.mT.contiguous().mT
+    weight = torch.nn.functional.pad(weight, (0, 0, 0, 16))
     quant, scale = downcast_to_mxfp(weight, src_dtype, axis=1)
     dequant = upcast_from_mxfp(quant, scale, dst_dtype, axis=1)
     assert_equal(weight, dequant)
@@ -143,7 +148,7 @@ def test_mxfp_quant_dequant(src_dtype, dst_dtype, device):
         ((0, 0, 1024), 2, "float8_e4m3fn", DequantScaleRoundingMode.ROUND_DOWN),
 
         ((3, 4096, 1024), 1, "float4_e2m1", DequantScaleRoundingMode.ROUND_UP),
-        ((10, 254, 60), 0, "float4_e2m1", DequantScaleRoundingMode.ROUND_DOWN),
+        ((32, 254, 60), 0, "float4_e2m1", DequantScaleRoundingMode.ROUND_DOWN),
         ((1, 320, 160), 2, "float8_e5m2", DequantScaleRoundingMode.ROUND_UP),
         ((2, 16, 512), -1, "float8_e4m3fn", DequantScaleRoundingMode.ROUND_DOWN),
     ],
 
@@ -1,13 +1,14 @@
 import pytest
 import torch
-from triton_kernels.tensor import Bitmatrix, BIT
+from triton_kernels.tensor_details.dtype import BIT
 from triton_kernels.tensor import (
     make_ragged_tensor_metadata,
     make_ragged_tensor_metadata_torch,
     remap_ragged_tensor_metadata,
     remap_ragged_tensor_metadata_torch,
     make_bitmatrix_metadata,
     make_bitmatrix_metadata_torch,
+    wrap_torch_tensor,
 )
 from triton_kernels.testing import assert_equal
 
@@ -65,7 +66,7 @@ def test_make_bitmatrix_metadata(n_rows, n_cols, k):
     rows = torch.arange(n_rows, device=device).unsqueeze(1).expand_as(indx)
     bitmask_data = torch.zeros((n_rows, (n_cols + 31) // 32), dtype=torch.int32, device=device)
     bitmask_data.index_put_((rows, indx // 32), 1 << (indx % 32), accumulate=True)
-    bitmask = Bitmatrix(bitmask_data.view(torch.uint32), dtype=BIT, shape=(n_rows, n_cols))
+    bitmask = wrap_torch_tensor(bitmask_data.view(torch.uint32), dtype=BIT, shape=(n_rows, n_cols))
     # make metadata and compare
     metadata_tri = make_bitmatrix_metadata(indx, bitmask)
     metadata_ref = make_bitmatrix_metadata_torch(indx, bitmask)
 
@@ -20,16 +20,18 @@
 )
 def test_mxfp4_scale_roundtrip(shape):
     x = torch.randint(0, 256, shape, dtype=torch.uint8, device="cuda")
-    layout = BlackwellMXScaleLayout(x.shape)
-    res = layout.unswizzle_data(layout.swizzle_data(x))
+    layout = BlackwellMXScaleLayout()
+    transformation = layout.make_transformation(x.shape, is_fp4=False)
+    res = transformation.unswizzle_data(transformation.swizzle_data(x))
     assert (res == x).all()
 
 
 @pytest.mark.parametrize("shape", [(2, 256, 192), (1, 128, 64)])
 def test_act_scale_roundtrip_batched(shape):
     x = torch.randn(shape, device="cuda", dtype=torch.float32)
-    layout = BlackwellActMXScaleLayout(x.shape)
-    res = layout.unswizzle_data(layout.swizzle_data(x))
+    layout = BlackwellActMXScaleLayout(ragged_metadata=None)
+    transformation = layout.make_transformation(x.shape, is_fp4=False)
+    res = transformation.unswizzle_data(transformation.swizzle_data(x))
     torch.testing.assert_close(res, x)
 
 
@@ -45,8 +47,9 @@ def test_act_scale_roundtrip_ragged(slice_sizes, m, k, align_m):
     m = max(m, slice_sizes.sum().item())  # there can be padded tokens in the input
     ragged_metadata = make_ragged_tensor_metadata(slice_sizes, m)
     x = torch.randn((m, k), device="cuda", dtype=torch.float32)
-    layout = BlackwellActMXScaleLayout((m, k), ragged_metadata=ragged_metadata)
-    res = layout.unswizzle_data(layout.swizzle_data(x))
+    layout = BlackwellActMXScaleLayout(ragged_metadata=ragged_metadata)
+    transformation = layout.make_transformation(x.shape, is_fp4=False)
+    res = transformation.unswizzle_data(transformation.swizzle_data(x))
 
     x_useful_rows = x[ragged_metadata.slice_offs[:-1], :]
     res_useful_rows = res[ragged_metadata.slice_offs[:-1], :]
 
@@ -19,6 +19,7 @@
 )
 def test_mxfp4_scale_roundtrip(shape):
     x = torch.randint(0, 256, shape, dtype=torch.uint8, device="cuda")
-    layout = CDNA4MXScaleLayout(x.shape)
-    res = layout.unswizzle_data(layout.swizzle_data(x))
+    layout = CDNA4MXScaleLayout()
+    transformation = layout.make_transformation(x.shape, is_fp4=False)
+    res = transformation.unswizzle_data(transformation.swizzle_data(x))
     assert (res == x).all()
@@ -1,6 +1,6 @@
 import pytest
 from triton._internal_testing import is_cuda
-from triton_kernels.tensor import wrap_torch_tensor, convert_layout, FP4, get_layout
+from triton_kernels.tensor import wrap_torch_tensor, convert_layout, FP4
 from triton_kernels.tensor_details.layout import HopperMXScaleLayout, HopperMXValueLayout
 from triton_kernels.numerics_details.mxfp import downcast_to_mxfp, upcast_from_mxfp
 from triton_kernels.tensor_details.layout_details.hopper_value import mxfp4_to_bf16_triton
@@ -25,8 +25,11 @@ def test_mxfp4_value_roundtrip(shape, trans, mx_axis, mma_version):
         x = x.mT
     if x.shape[1 - mx_axis] < 32:
         pytest.skip("Not enough elements along non-mx axis")
-    layout = HopperMXValueLayout(x.shape, mx_axis, mma_version)
-    res = layout.unswizzle_data(layout.swizzle_data(x))
+    layout = HopperMXValueLayout(mx_axis, mma_version)
+    shape = list(x.shape)
+    shape[-1] *= 2
+    transformation = layout.make_transformation(shape, is_fp4=False)
+    res = transformation.unswizzle_data(transformation.swizzle_data(x))
     assert (res == x).all()
 
 
@@ -35,8 +38,9 @@ def test_mxfp4_value_roundtrip(shape, trans, mx_axis, mma_version):
 @pytest.mark.parametrize("shape", [(256, 64), (256, 128), (256, 256)])
 def test_mxfp4_scale_roundtrip(shape, mx_axis, num_warps):
     x = torch.randint(0, 256, shape, dtype=torch.uint8, device="cuda")
-    layout = HopperMXScaleLayout(x.shape, mx_axis=mx_axis, num_warps=num_warps)
-    res = layout.unswizzle_data(layout.swizzle_data(x))
+    layout = HopperMXScaleLayout(mx_axis=mx_axis, num_warps=num_warps)
+    transformation = layout.make_transformation(x.shape, is_fp4=False)
+    res = transformation.unswizzle_data(transformation.swizzle_data(x))
     assert (res[:shape[0], :shape[1]] == x).all()
 
 
@@ -85,13 +89,13 @@ def test_upcast_mxfp4_to_bf16(num_warps, mx_axis):
     x_bf16 = upcast_from_mxfp(x_fp4_val, x_fp4_scale, x.dtype, axis=mx_axis)
     x_fp4_val = wrap_torch_tensor(x_fp4_val, dtype=FP4)
     x_fp4_scale = wrap_torch_tensor(x_fp4_scale)
-    x_fp4_val = convert_layout(x_fp4_val, HopperMXValueLayout, mx_axis=mx_axis)
-    x_fp4_scale = convert_layout(x_fp4_scale, HopperMXScaleLayout, mx_axis=mx_axis, num_warps=num_warps)
+    x_fp4_val = convert_layout(x_fp4_val, HopperMXValueLayout(mx_axis=mx_axis - 2, mma_version=3))
+    x_fp4_scale = convert_layout(x_fp4_scale, HopperMXScaleLayout(mx_axis=mx_axis - 2, num_warps=num_warps))
     y = torch.empty_like(x_bf16)
     scale_block = [s // 32 if i == mx_axis else s for i, s in enumerate(shape)]
-    scale_block = get_layout(x_fp4_scale).swizzle_block_shape(scale_block)
+    scale_block = x_fp4_scale.storage.layout.swizzle_block_shape(scale_block)
     value_block = [s // 2 if i == mx_axis else s for i, s in enumerate(shape)]
-    value_block = get_layout(x_fp4_val).swizzle_block_shape(value_block)
+    value_block = x_fp4_val.storage.layout.swizzle_block_shape(value_block)
     _upcast_mxfp4_to_bf16[(1, )](
         y, x_fp4_val.storage.data, x_fp4_scale.storage.data,  #
         x_fp4_val.storage.data.stride(0), x_fp4_val.storage.data.stride(1),  #
 
@@ -1,6 +1,6 @@
 import torch
 from .compaction_details._masked_compaction import _masked_compaction
-from .tensor import Bitmatrix
+from .tensor import Tensor
 
 
 def compaction(yv, yi, bitmask, sentinel=-1):
@@ -32,7 +32,7 @@ def compaction(yv, yi, bitmask, sentinel=-1):
     n_rows, n_cols = yi.shape
     ret_yv = torch.empty_like(yv)
     ret_yi = torch.empty_like(yi)
-    if isinstance(bitmask, Bitmatrix):
+    if isinstance(bitmask, Tensor):
         bitmask = bitmask.storage.data
 
     _masked_compaction[(n_rows, )](