Skip to content

Commit 69b8876

Browse files
[mxfp8 training] cuda kernel for unpadding token groups
stack-info: PR: #4021, branch: danielvegamyhre/stack/148
1 parent 17fd81e commit 69b8876

File tree

10 files changed

+760
-61
lines changed

10 files changed

+760
-61
lines changed
Lines changed: 258 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,258 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the BSD 3-Clause license found in the
5+
# LICENSE file in the root directory of this source tree.
6+
# this benchmarking script is a modified version of the original script from: https://github.com/drisspg/transformer_nuggets/blob/main/transformer_nuggets/utils/benchmark.py
7+
8+
import argparse
9+
import itertools
10+
import time
11+
from dataclasses import dataclass
12+
from typing import List
13+
14+
import torch
15+
from tabulate import tabulate
16+
from tqdm import tqdm
17+
18+
from benchmarks.utils import profile_fn
19+
from torchao.prototype.moe_training.kernels.mxfp8 import (
20+
_mxfp8_cuda_kernels_available,
21+
fused_unpad_token_groups_cuda,
22+
torch_pad_token_groups,
23+
torch_unpad_token_groups,
24+
)
25+
from torchao.prototype.moe_training.utils import generate_jagged_offs
26+
27+
# All benchmarks run on the current CUDA device.
device = torch.device("cuda")

# Needed since changing args to function causes recompiles
torch._dynamo.config.cache_size_limit = 1000
31+
32+
33+
@dataclass(frozen=True)
class ExperimentConfig:
    """One benchmark configuration for the token-group unpad kernels."""

    # Total number of input tokens (rows of the activation matrix).
    num_tokens: int
    # Hidden dimension (columns of the activation matrix).
    dim: int
    # Number of token groups (e.g. experts in an MoE layer).
    num_groups: int
    # Each group's padded length is rounded up to a multiple of this value.
    alignment_size: int
40+
41+
@dataclass(frozen=True)
class ExperimentResult:
    """Timings and derived memory bandwidth for one configuration."""

    # Host-side latency of the eager torch implementation, microseconds.
    torch_eager_time_us: float
    # Host-side latency of the fused CUDA kernel, microseconds
    # (inf when the CUDA kernel is unavailable on this machine).
    cuda_time_us: float
    # Effective memory bandwidth of the torch implementation, GB/s.
    torch_mem_bw_gbps: float
    # Effective memory bandwidth of the CUDA kernel, GB/s (0.0 if unavailable).
    cuda_mem_bw_gbps: float
47+
48+
49+
@dataclass(frozen=True)
class Experiment:
    """Pairs a configuration with the result measured for it."""

    config: ExperimentConfig
    result: ExperimentResult
53+
54+
55+
def get_configs() -> List[ExperimentConfig]:
    """Build the full benchmark grid over token counts, dims, group counts,
    and alignment sizes (cartesian product of the lists below)."""
    # Various token group sizes and dimensions
    num_tokens_list = [16384]
    dim_list = [1536, 2048, 5120, 7168]
    num_groups_list = [1, 4, 8, 16]
    alignment_size_list = [32]

    return [
        ExperimentConfig(
            num_tokens=num_tokens,
            dim=dim,
            num_groups=num_groups,
            alignment_size=alignment_size,
        )
        for num_tokens, dim, num_groups, alignment_size in itertools.product(
            num_tokens_list, dim_list, num_groups_list, alignment_size_list
        )
    ]
75+
76+
77+
def benchmark_host_side_in_microseconds(fn, *args, num_iters=100, **kwargs):
    """
    Benchmark `fn(*args, **kwargs)` with host-side wall-clock timing.

    Synchronizes the CUDA device before and after the timed loop so queued GPU
    work is accounted for; the measurement therefore includes any per-call
    host overhead such as output-buffer allocation. Returns the mean latency
    per iteration in microseconds.
    """
    torch.cuda.synchronize()
    t0 = time.perf_counter()
    for _ in range(num_iters):
        fn(*args, **kwargs)
    torch.cuda.synchronize()
    t1 = time.perf_counter()
    # Mean seconds per iteration, converted to microseconds.
    return (t1 - t0) / num_iters * 1e6
88+
89+
90+
def run_experiment(
    config: ExperimentConfig, args: argparse.Namespace
) -> ExperimentResult:
    """Benchmark torch-eager vs. fused-CUDA token-group unpadding for one config.

    Builds random bf16 activations, pads them per group, then times the two
    unpad implementations and derives effective memory bandwidth for each.
    With --profile, additionally captures PyTorch profiler traces.
    """
    num_tokens, dim, num_groups, alignment_size = (
        config.num_tokens,
        config.dim,
        config.num_groups,
        config.alignment_size,
    )

    # Create inputs and pad them first
    inputs = torch.randn(num_tokens, dim, dtype=torch.bfloat16, device=device)
    # multiple_of=1 produces fully jagged group boundaries, so padding is
    # actually required for most groups.
    group_offsets = generate_jagged_offs(
        num_groups, num_tokens, multiple_of=1, device=device
    )

    # Pad the inputs to get padded tensors for unpad benchmark
    padded_inputs, padded_group_end_offsets = torch_pad_token_groups(
        inputs, group_offsets, alignment_size
    )

    # Compute padded group start offsets.
    # group_offsets holds cumulative end indices, so diff (with a prepended 0)
    # recovers per-group sizes.
    group_sizes = torch.diff(
        group_offsets,
        prepend=torch.zeros(1, dtype=group_offsets.dtype, device=group_offsets.device),
    )
    # Round each group size up to the next multiple of alignment_size.
    padded_sizes = (
        (group_sizes + alignment_size - 1) // alignment_size
    ) * alignment_size
    padded_group_start_offsets = padded_group_end_offsets - padded_sizes

    def torch_eager_with_offsets():
        # Closure so the benchmark harness times only the unpad call itself.
        return torch_unpad_token_groups(
            padded_inputs, group_offsets, padded_group_start_offsets, alignment_size
        )

    def warmup(fn):
        # A few untimed iterations to absorb one-time setup costs.
        for _ in range(5):
            fn()

    # bench torch eager (includes buffer allocation overhead)
    warmup(torch_eager_with_offsets)
    torch_eager_time_us = benchmark_host_side_in_microseconds(torch_eager_with_offsets)
    if args.profile:
        profile_fn(
            torch_unpad_token_groups,
            padded_inputs,
            group_offsets,
            padded_group_start_offsets,
            alignment_size,
            profile_name="torch_unpad_token_groups_eager",
        )

    # bench CUDA kernel if available
    if _mxfp8_cuda_kernels_available:

        def cuda_with_offsets():
            return fused_unpad_token_groups_cuda(
                padded_inputs,
                group_offsets,
                padded_group_start_offsets,
                num_tokens,
                alignment_size,
            )

        warmup(cuda_with_offsets)
        cuda_time_us = benchmark_host_side_in_microseconds(cuda_with_offsets)
        if args.profile:
            profile_fn(
                fused_unpad_token_groups_cuda,
                padded_inputs,
                group_offsets,
                padded_group_start_offsets,
                num_tokens,
                alignment_size,
                profile_name="fused_unpad_token_groups_cuda",
            )
    else:
        cuda_time_us = float("inf")  # Not available

    # mem bw calculations
    bytes_per_el = torch.finfo(torch.bfloat16).bits / 8

    # NOTE(review): the offset tensors are assumed to be int32 (4 bytes each)
    # here — confirm against generate_jagged_offs / torch_pad_token_groups.
    read_bytes = (
        padded_inputs.numel() * bytes_per_el  # Read padded input tokens
        + group_offsets.numel() * 4  # Read group offsets (int32)
        + padded_group_start_offsets.numel() * 4  # Read padded start offsets (int32)
    )

    write_bytes = (
        inputs.numel() * bytes_per_el  # Write unpadded data
    )

    total_bytes = read_bytes + write_bytes

    # GB/s = (bytes / 1e9) / seconds
    torch_mem_bw_gbps = (total_bytes / 1e9) / (torch_eager_time_us / 1e6)

    if _mxfp8_cuda_kernels_available and cuda_time_us != float("inf"):
        cuda_mem_bw_gbps = (total_bytes / 1e9) / (cuda_time_us / 1e6)
    else:
        cuda_mem_bw_gbps = 0.0

    return ExperimentResult(
        torch_eager_time_us=torch_eager_time_us,
        cuda_time_us=cuda_time_us,
        torch_mem_bw_gbps=torch_mem_bw_gbps,
        cuda_mem_bw_gbps=cuda_mem_bw_gbps,
    )
198+
199+
200+
def print_results(experiments: List[Experiment]):
    """Print a tabulated summary of all experiments, one row per config.

    The CUDA columns show "N/A" when the fused kernel was unavailable
    (cuda_time_us == inf / cuda_mem_bw_gbps == 0).
    """
    headers = [
        "num_tokens",
        "dim",
        "num_groups",
        "torch_us",
        "cuda_us",
        "torch_mem_bw_gbps",
        "cuda_mem_bw_gbps",
        "cuda_vs_torch",
    ]

    def _row(exp):
        cfg, res = exp.config, exp.result
        t_cuda = res.cuda_time_us
        if t_cuda != float("inf") and t_cuda > 0:
            speedup = f"{res.torch_eager_time_us / t_cuda:.2f}x"
        else:
            speedup = "N/A"
        if res.cuda_mem_bw_gbps > 0:
            cuda_bw = f"{res.cuda_mem_bw_gbps:.2f}"
        else:
            cuda_bw = "N/A"
        return [
            cfg.num_tokens,
            cfg.dim,
            cfg.num_groups,
            res.torch_eager_time_us,
            res.cuda_time_us,
            f"{res.torch_mem_bw_gbps:.2f}",
            cuda_bw,
            speedup,
        ]

    print(tabulate([_row(exp) for exp in experiments], headers=headers))
238+
239+
240+
def main(args: argparse.Namespace):
    """Run every benchmark configuration and print the summary table."""
    # Fixed seed so input tensors are reproducible across runs.
    torch.random.manual_seed(123)
    results = [
        Experiment(config=config, result=run_experiment(config, args))
        for config in tqdm(get_configs())
    ]

    # Use Tabulate to print results
    print_results(results)
250+
251+
252+
if __name__ == "__main__":
    # CLI entry point; --profile additionally captures PyTorch profiler traces.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--profile", action="store_true", help="Enable profiling with PyTorch profiler"
    )
    args = parser.parse_args()
    main(args)

test/prototype/moe_training/test_kernels.py

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,12 +23,14 @@
2323
from torchao.prototype.moe_training.kernels.mxfp8 import (
2424
_mxfp8_cuda_kernels_available,
2525
fused_pad_token_groups_cuda,
26+
fused_unpad_token_groups_cuda,
2627
mx_block_rearrange_2d_M_groups_cuda,
2728
mxfp8_quantize_cuda_3d,
2829
torch_pad_token_groups,
2930
torch_to_blocked_2d_K_groups,
3031
torch_to_blocked_2d_M_groups,
3132
torch_to_blocked_per_group_3d,
33+
torch_unpad_token_groups,
3234
triton_mx_block_rearrange_2d_K_groups,
3335
triton_mx_block_rearrange_2d_M_groups,
3436
triton_mx_block_rearrange_per_group_3d,
@@ -452,3 +454,67 @@ def test_cuda_fused_pad_token_groups(
452454
assert torch.equal(ref_padded_offsets, kernel_padded_offsets), (
453455
"Padded group offsets do not match"
454456
)
457+
458+
459+
@pytest.mark.skipif(
    not _mxfp8_cuda_kernels_available,
    reason="CUDA kernel requires sm_100 and CUDA 12.8+",
)
@skip_if_rocm("ROCm enablement in progress")
@pytest.mark.parametrize("num_tokens", [128, 157, 4096])
@pytest.mark.parametrize("dim", [7168])
@pytest.mark.parametrize("num_groups", [1, 2, 4, 8])
@pytest.mark.parametrize("alignment_size", [32])
@pytest.mark.parametrize("dtype", [torch.bfloat16])
def test_cuda_fused_unpad_token_groups(
    num_tokens: int, dim: int, num_groups: int, alignment_size: int, dtype: torch.dtype
):
    """Test fused_unpad_token_groups_cuda kernel for removing padding from token groups."""
    device = "cuda"

    # Create input activations
    inputs = torch.randn(num_tokens, dim, dtype=dtype, device=device)

    # Generate group offsets (end indices for each group); multiple_of=1
    # makes the group boundaries jagged so padding is genuinely exercised.
    group_offsets = generate_jagged_offs(
        num_groups, num_tokens, multiple_of=1, device=device
    )

    # First pad the tokens to create padded inputs
    padded_tokens, padded_group_end_offsets = torch_pad_token_groups(
        inputs, group_offsets, alignment_size
    )

    # Compute padded group start offsets.
    # group_offsets is cumulative, so diff with a prepended zero gives sizes.
    group_sizes = torch.diff(
        group_offsets,
        prepend=torch.zeros(1, dtype=group_offsets.dtype, device=group_offsets.device),
    )
    # Round each group size up to the next multiple of alignment_size.
    padded_sizes = (
        (group_sizes + alignment_size - 1) // alignment_size
    ) * alignment_size
    padded_group_start_offsets = padded_group_end_offsets - padded_sizes

    # Get reference output using torch implementation
    ref_unpadded_tokens = torch_unpad_token_groups(
        padded_tokens, group_offsets, padded_group_start_offsets, alignment_size
    )

    # Run CUDA kernel
    kernel_unpadded_tokens = fused_unpad_token_groups_cuda(
        padded_tokens,
        group_offsets,
        padded_group_start_offsets,
        num_tokens,
        alignment_size,
    )

    # Verify outputs match
    assert torch.allclose(
        ref_unpadded_tokens, kernel_unpadded_tokens, rtol=0, atol=1e-5
    ), "Unpadded tokens do not match"

    # Verify that unpad correctly reverses pad operation
    assert torch.allclose(inputs, kernel_unpadded_tokens, rtol=0, atol=1e-5), (
        "Unpadded tokens should match original inputs"
    )

test/prototype/moe_training/test_mxfp8_grouped_mm.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,7 @@ def test_emulate_mxfp8_grouped_gemm_2d_2d(M, N, num_experts):
126126
@pytest.mark.parametrize("num_experts", (1, 8))
127127
@pytest.mark.parametrize("wgrad_with_hp", (True, False))
128128
@pytest.mark.parametrize("use_compile", (False, True))
129+
@pytest.mark.parametrize("pad_token_groups_for_grouped_mm", (False, True))
129130
@pytest.mark.parametrize(
130131
"kernel_preference", (KernelPreference.AUTO, KernelPreference.EMULATED)
131132
)
@@ -141,12 +142,21 @@ def test_mxfp8_grouped_gemm_with_dq_fwd_bwd(
141142
use_compile,
142143
kernel_preference,
143144
scale_mode,
145+
pad_token_groups_for_grouped_mm,
144146
):
145147
# MXFP8 hardware path requires SM100
146148
if kernel_preference != KernelPreference.EMULATED and not is_sm_version(10, 0):
147149
pytest.skip(
148150
f"Skipping MXFP8 hardware mode tests, only supported on compute capability 10.0 and found {torch.cuda.get_device_capability()}"
149151
)
152+
if (
153+
kernel_preference == KernelPreference.EMULATED
154+
and use_compile
155+
and pad_token_groups_for_grouped_mm
156+
):
157+
pytest.skip(
158+
f"torch native dynamic per group pad/unpad functions do not work with torch.compile yet: https://github.com/pytorch/pytorch/issues/176770"
159+
)
150160

151161
block_size = 32
152162
x = torch.randn(M, K, dtype=torch.bfloat16, device="cuda", requires_grad=True)
@@ -158,7 +168,9 @@ def test_mxfp8_grouped_gemm_with_dq_fwd_bwd(
158168
device="cuda",
159169
)
160170
w_t = w.transpose(-2, -1).requires_grad_(True)
161-
offs = generate_jagged_offs(num_experts, M, multiple_of=block_size)
171+
172+
multiple_of = 1 if pad_token_groups_for_grouped_mm else 32
173+
offs = generate_jagged_offs(num_experts, M, multiple_of=multiple_of)
162174
x_ref, w_t_ref, offs_ref = (
163175
x.clone().detach().requires_grad_(True),
164176
w_t.clone().detach().requires_grad_(True),
@@ -179,6 +191,7 @@ def test_mxfp8_grouped_gemm_with_dq_fwd_bwd(
179191
kernel_preference=kernel_preference,
180192
wgrad_with_hp=wgrad_with_hp,
181193
scale_calculation_mode=scale_mode,
194+
pad_token_groups_for_grouped_mm=pad_token_groups_for_grouped_mm,
182195
)
183196
ref_out = torch._grouped_mm(x_ref, w_t_ref, offs=offs_ref, out_dtype=torch.bfloat16)
184197
sqnr = compute_error(ref_out, out)

0 commit comments

Comments
 (0)