Skip to content

Commit dad63bd

Browse files
authored
[TRITON_KERNELS] some more tweaks (#9350)
1 parent 0bff14b commit dad63bd

4 files changed

Lines changed: 28 additions & 18 deletions

File tree

python/triton_kernels/triton_kernels/tensor.py

Lines changed: 17 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,8 @@
99
from .tensor_details import ragged_tensor as ragged_tensor_details
1010
from .tensor_details.layout import BlackwellMXValueLayout, Layout, StridedLayout
1111
from .tensor_details.ragged_tensor import RaggedTensorMetadata
12-
from .tensor_details.dtype import IntegerType, FloatType, DataType, FP4, UINT8, FP8_E4M3FN, FP8_E4M3FNUZ, FP8_E5M2, FP16, BF16, FP32, FP64
12+
from .tensor_details.dtype import IntegerType, FloatType, DataType
13+
from .tensor_details.dtype import FP4, UINT8, FP8_E4M3FN, FP8_E4M3FNUZ, FP8_E5M2, FP16, BF16, FP32, FP64, INT16, INT32, INT64
1314

1415

1516
# storage
@@ -246,6 +247,9 @@ def dtype_to_torch_dtype(dtype: DataType) -> torch.dtype:
246247
FP32: torch.float32,
247248
FP16: torch.float16,
248249
FP64: torch.float64,
250+
INT16: torch.int16,
251+
INT32: torch.int32,
252+
INT64: torch.int64,
249253
}[dtype]
250254

251255

@@ -262,6 +266,9 @@ def torch_dtype_to_dtype(dtype: torch.dtype) -> DataType:
262266
"bfloat16": BF16,
263267
"float32": FP32,
264268
"float64": FP64,
269+
"int16": INT16,
270+
"int32": INT32,
271+
"int64": INT64,
265272
}
266273
if id in vals:
267274
return vals[id]
@@ -270,15 +277,13 @@ def torch_dtype_to_dtype(dtype: torch.dtype) -> DataType:
270277
assert False, f"Unknown dtype: {id}"
271278

272279

273-
def empty(shape: tuple[int], dtype: DataType, device: torch.device, layout=None):
280+
def empty(shape: tuple[int], dtype: DataType, device: torch.device, layout=None,
281+
allow_implicit_conversion: bool = False):
274282
storage_shape = list(shape)
275283
storage_dtype = torch.uint8 if dtype == FP4 else dtype_to_torch_dtype(dtype)
284+
initial_layout = layout if isinstance(layout, StridedLayout) else StridedLayout()
276285
# pack sub-byte datatype along last dimension
277-
if layout is None:
278-
layout = StridedLayout()
279-
# storage shape
280-
assert isinstance(layout, StridedLayout)
281-
order = layout.order(len(storage_shape))
286+
order = initial_layout.order(len(storage_shape))
282287
dim = order[0]
283288
storage_shape[dim] = storage_shape[dim] // (storage_dtype.itemsize * 8 // dtype.bitwidth)
284289
# storage strides
@@ -288,4 +293,8 @@ def empty(shape: tuple[int], dtype: DataType, device: torch.device, layout=None)
288293
strides[d] = running
289294
running *= storage_shape[d]
290295
storage = torch.empty_strided(storage_shape, strides, device=device, dtype=storage_dtype)
291-
return wrap_torch_tensor(storage, dtype=dtype, shape=shape, layout=layout)
296+
ret = wrap_torch_tensor(storage, dtype=dtype, shape=shape, layout=initial_layout)
297+
assert initial_layout == ret.storage.layout or allow_implicit_conversion
298+
if allow_implicit_conversion:
299+
ret = convert_layout(ret, layout)
300+
return ret

python/triton_kernels/triton_kernels/tensor_details/dtype.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,5 +32,8 @@ def bitwidth(self):
3232
FP16 = FloatType(bitwidth_exponent=5, bitwidth_mantissa=10, is_signed=True)
3333
FP32 = FloatType(bitwidth_exponent=8, bitwidth_mantissa=23, is_signed=True)
3434
FP64 = FloatType(bitwidth_exponent=11, bitwidth_mantissa=52, is_signed=True)
35+
INT16 = IntegerType(16, is_signed=True)
36+
INT32 = IntegerType(32, is_signed=True)
37+
INT64 = IntegerType(64, is_signed=True)
3538

3639
DataType: TypeAlias = IntegerType | FloatType

python/triton_kernels/triton_kernels/topk.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -170,8 +170,8 @@ def topk_torch(
170170
if apply_softmax:
171171
y_vals = torch.softmax(y_vals.float(), dim=-1).to(x.dtype)
172172
if not has_user_provided_indx:
173-
y_indx, sort_indices = torch.sort(y_indx, dim=1)
174-
y_vals = torch.gather(y_vals, 1, sort_indices)
173+
y_vals, sort_indices = torch.sort(y_vals.float(), dim=1, descending=True, stable=True)
174+
y_indx = torch.gather(y_indx, 1, sort_indices)
175175
y_indx[n_rows:, :] = -1
176176
rows = torch.arange(x.shape[0], device=device).unsqueeze(1).expand(-1, y_indx.shape[1]).reshape(-1)
177177
cols = y_indx.reshape(-1) # 64-bit safe for div/mod

python/triton_kernels/triton_kernels/topk_details/_topk_forward.py

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -71,16 +71,14 @@ def streaming_topk(X, stride_xm, n_expts_tot, offs_m, mask_m, N_EXPTS_PAD: tl.co
7171
x = (x.to(x_ultype) << 16) | indx_to_key(offs_x_n, N_EXPTS_PAD)[None, :]
7272
acc = tl.maximum(acc, tl.topk(x, N_EXPTS_ACT, dim=1))
7373

74-
# rotate expert index into upper 16 bits:
75-
# 0000vvvvvvvviiii --> iiii0000vvvvvvvv
76-
acc = (acc << (y_nbits - 16)) | (acc >> 16)
77-
# sort in ascending order of expert (descending order of key)
74+
# sort packed (value_key, index_key) descending:
75+
# this keeps outputs ordered by gate value and uses smaller expert index for ties
7876
acc = tl.sort(acc, dim=1, descending=True)
79-
# iiii0000vvvvvvvv --> 0000iiii:
80-
y_indices_raw = (acc >> (y_nbits - 16)).to(tl.uint32)
77+
# 0000vvvvvvvviiii --> 0000iiii:
78+
y_indices_raw = (acc & 0xFFFF).to(tl.uint32)
8179
y_indices = key_to_indx(y_indices_raw, N_EXPTS_PAD)
82-
# iiii0000vvvvvvvv --> vvvvvvvv:
83-
y_values_raw = acc.to(x_utype)
80+
# 0000vvvvvvvviiii --> vvvvvvvv:
81+
y_values_raw = (acc >> 16).to(x_utype)
8482
y_values = key_to_fpval(y_values_raw).to(x_dtype, bitcast=True)
8583

8684
return y_values, y_indices

0 commit comments

Comments (0)