Skip to content

Commit 26ae5c7

Browse files
[mxfp8 training] cuda kernel for unpadding token groups
stack-info: PR: #4021, branch: danielvegamyhre/stack/148
1 parent 17fd81e commit 26ae5c7

File tree

14 files changed

+903
-174
lines changed

14 files changed

+903
-174
lines changed

benchmarks/prototype/moe_training/mxfp8/bench_pad_token_groups.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -102,7 +102,9 @@ def torch_eager_with_offsets():
102102
group_offsets = generate_jagged_offs(
103103
num_groups, num_tokens, multiple_of=1, device=device
104104
)
105-
return torch_pad_token_groups(inputs, group_offsets, alignment_size)
105+
return torch_pad_token_groups(
106+
inputs, group_offsets, alignment_size
107+
) # Returns 3 values
106108

107109
def warmup(fn):
108110
for _ in range(5):
@@ -152,8 +154,8 @@ def cuda_with_offsets():
152154
group_offsets = generate_jagged_offs(
153155
num_groups, num_tokens, multiple_of=1, device=device
154156
)
155-
torch_padded_tokens, torch_padded_offsets = torch_pad_token_groups(
156-
inputs, group_offsets, alignment_size
157+
torch_padded_tokens, torch_padded_start_offsets, torch_padded_offsets = (
158+
torch_pad_token_groups(inputs, group_offsets, alignment_size)
157159
)
158160

159161
bytes_per_el = torch.finfo(torch.bfloat16).bits / 8
Lines changed: 252 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,252 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the BSD 3-Clause license found in the
5+
# LICENSE file in the root directory of this source tree.
6+
# this benchmarking script is a modified version of the original script from: https://github.com/drisspg/transformer_nuggets/blob/main/transformer_nuggets/utils/benchmark.py
7+
8+
import argparse
9+
import itertools
10+
import time
11+
from dataclasses import dataclass
12+
from typing import List
13+
14+
import torch
15+
from tabulate import tabulate
16+
from tqdm import tqdm
17+
18+
from benchmarks.utils import profile_fn
19+
from torchao.prototype.moe_training.kernels.mxfp8 import (
20+
_mxfp8_cuda_kernels_available,
21+
fused_unpad_token_groups_cuda,
22+
torch_pad_token_groups,
23+
torch_unpad_token_groups,
24+
)
25+
from torchao.prototype.moe_training.utils import generate_jagged_offs
26+
27+
device = torch.device("cuda")
28+
29+
# Needed since changing args to function causes recompiles
30+
torch._dynamo.config.cache_size_limit = 1000
31+
32+
33+
@dataclass(frozen=True)
class ExperimentConfig:
    """One point in the benchmark sweep (see get_configs)."""

    # Total number of input tokens across all groups.
    num_tokens: int
    # Hidden dimension (row width) of each token.
    dim: int
    # Number of token groups (presumably one per expert — inferred from
    # the MoE context; confirm against generate_jagged_offs usage).
    num_groups: int
    # Row-count multiple each group is padded to before unpadding.
    alignment_size: int
41+
@dataclass(frozen=True)
class ExperimentResult:
    """Measured timings and derived bandwidths for one config."""

    # Host-side mean latency of the torch eager unpad path, microseconds.
    torch_eager_time_us: float
    # Host-side mean latency of the CUDA kernel; inf when kernels unavailable.
    cuda_time_us: float
    # Achieved memory bandwidth of the eager path, GB/s.
    torch_mem_bw_gbps: float
    # Achieved memory bandwidth of the CUDA kernel, GB/s; 0.0 when unavailable.
    cuda_mem_bw_gbps: float
49+
@dataclass(frozen=True)
class Experiment:
    """Pairs a benchmark configuration with its measured result."""

    # The swept configuration this result belongs to.
    config: ExperimentConfig
    # Timings/bandwidths measured by run_experiment for that config.
    result: ExperimentResult
55+
def get_configs() -> List[ExperimentConfig]:
    """Build the full benchmark sweep.

    Returns one frozen ExperimentConfig per element of the cartesian
    product of token counts, hidden dims, group counts, and alignment
    sizes.
    """
    # Various token group sizes and dimensions
    token_counts = [16384]
    dims = [1536, 2048, 5120, 7168]
    group_counts = [1, 4, 8, 16]
    alignments = [32]
    return [
        ExperimentConfig(
            num_tokens=tokens,
            dim=dim,
            num_groups=groups,
            alignment_size=align,
        )
        for tokens, dim, groups, align in itertools.product(
            token_counts, dims, group_counts, alignments
        )
    ]
75+
76+
77+
def benchmark_host_side_in_microseconds(fn, *args, num_iters=100, **kwargs):
    """Benchmark using host-side timing, includes buffer allocation overhead.

    Synchronizes the GPU before and after the timed region so queued work
    is fully drained, then returns the mean wall-clock latency of one call
    to ``fn(*args, **kwargs)`` in microseconds.
    """
    torch.cuda.synchronize()
    t0 = time.perf_counter()
    iteration = 0
    while iteration < num_iters:
        fn(*args, **kwargs)
        iteration += 1
    torch.cuda.synchronize()
    elapsed_s = time.perf_counter() - t0
    return elapsed_s / num_iters * 1e6  # seconds -> microseconds
88+
89+
90+
def run_experiment(
    config: ExperimentConfig, args: argparse.Namespace
) -> ExperimentResult:
    """Benchmark torch-eager vs. CUDA token-group unpadding for one config.

    Pads a random bf16 token tensor first, then times the unpad path with
    both implementations and derives achieved memory bandwidth from the
    bytes moved. When the CUDA kernels are unavailable, cuda_time_us is
    inf and cuda_mem_bw_gbps is 0.0.

    Args:
        config: sweep point (token count, dim, group count, alignment).
        args: parsed CLI flags; only ``args.profile`` is read here.

    Returns:
        ExperimentResult with timings (us) and bandwidths (GB/s).
    """
    num_tokens, dim, num_groups, alignment_size = (
        config.num_tokens,
        config.dim,
        config.num_groups,
        config.alignment_size,
    )

    # Create inputs and pad them first
    inputs = torch.randn(num_tokens, dim, dtype=torch.bfloat16, device=device)
    group_offsets = generate_jagged_offs(
        num_groups, num_tokens, multiple_of=1, device=device
    )

    # Pad the inputs to get padded tensors for unpad benchmark
    padded_inputs, padded_group_start_offsets, padded_group_end_offsets = (
        torch_pad_token_groups(inputs, group_offsets, alignment_size)
    )

    def torch_eager_with_offsets():
        return torch_unpad_token_groups(
            padded_inputs,
            group_offsets,
            padded_group_start_offsets,
            num_tokens,
            alignment_size,
        )

    def warmup(fn):
        # A few untimed iterations so lazy init / allocator warmup does not
        # pollute the measurement.
        for _ in range(5):
            fn()

    # bench torch eager (includes buffer allocation overhead)
    warmup(torch_eager_with_offsets)
    torch_eager_time_us = benchmark_host_side_in_microseconds(torch_eager_with_offsets)
    if args.profile:
        profile_fn(
            torch_unpad_token_groups,
            padded_inputs,
            group_offsets,
            padded_group_start_offsets,
            # BUGFIX: num_tokens was missing here, so alignment_size bound
            # to the num_tokens parameter during profiling. This now matches
            # the torch_eager_with_offsets closure and the CUDA branch below.
            num_tokens,
            alignment_size,
            profile_name="torch_unpad_token_groups_eager",
        )

    # bench CUDA kernel if available
    if _mxfp8_cuda_kernels_available:

        def cuda_with_offsets():
            return fused_unpad_token_groups_cuda(
                padded_inputs,
                group_offsets,
                padded_group_start_offsets,
                num_tokens,
                alignment_size,
            )

        warmup(cuda_with_offsets)
        cuda_time_us = benchmark_host_side_in_microseconds(cuda_with_offsets)
        if args.profile:
            profile_fn(
                fused_unpad_token_groups_cuda,
                padded_inputs,
                group_offsets,
                padded_group_start_offsets,
                num_tokens,
                alignment_size,
                profile_name="fused_unpad_token_groups_cuda",
            )
    else:
        cuda_time_us = float("inf")  # Not available

    # mem bw calculations
    bytes_per_el = torch.finfo(torch.bfloat16).bits / 8

    read_bytes = (
        padded_inputs.numel() * bytes_per_el  # Read padded input tokens
        + group_offsets.numel() * 4  # Read group offsets (int32)
        + padded_group_start_offsets.numel() * 4  # Read padded start offsets (int32)
    )

    write_bytes = inputs.numel() * bytes_per_el  # Write unpadded data

    total_bytes = read_bytes + write_bytes

    torch_mem_bw_gbps = (total_bytes / 1e9) / (torch_eager_time_us / 1e6)

    if _mxfp8_cuda_kernels_available and cuda_time_us != float("inf"):
        cuda_mem_bw_gbps = (total_bytes / 1e9) / (cuda_time_us / 1e6)
    else:
        cuda_mem_bw_gbps = 0.0

    return ExperimentResult(
        torch_eager_time_us=torch_eager_time_us,
        cuda_time_us=cuda_time_us,
        torch_mem_bw_gbps=torch_mem_bw_gbps,
        cuda_mem_bw_gbps=cuda_mem_bw_gbps,
    )
192+
193+
194+
def print_results(experiments: List[Experiment]):
    """Render all benchmark results as an aligned table on stdout."""
    headers = [
        "num_tokens",
        "dim",
        "num_groups",
        "torch_us",
        "cuda_us",
        "torch_mem_bw_gbps",
        "cuda_mem_bw_gbps",
        "cuda_vs_torch",
    ]
    rows = []
    for exp in experiments:
        cfg = exp.config
        res = exp.result
        cuda_us = res.cuda_time_us

        # Speedup is only meaningful when the CUDA kernel actually ran.
        if cuda_us != float("inf") and cuda_us > 0:
            cuda_vs_torch = f"{res.torch_eager_time_us / cuda_us:.2f}x"
        else:
            cuda_vs_torch = "N/A"

        if res.cuda_mem_bw_gbps > 0:
            cuda_bw_str = f"{res.cuda_mem_bw_gbps:.2f}"
        else:
            cuda_bw_str = "N/A"

        rows.append(
            [
                cfg.num_tokens,
                cfg.dim,
                cfg.num_groups,
                res.torch_eager_time_us,
                res.cuda_time_us,
                f"{res.torch_mem_bw_gbps:.2f}",
                cuda_bw_str,
                cuda_vs_torch,
            ]
        )
    print(tabulate(rows, headers=headers))
232+
233+
234+
def main(args: argparse.Namespace):
    """Run every benchmark configuration and print a summary table."""
    # Fixed seed so random inputs and jagged offsets are reproducible.
    torch.random.manual_seed(123)
    results = [
        Experiment(config=cfg, result=run_experiment(cfg, args))
        for cfg in tqdm(get_configs())
    ]
    # Use Tabulate to print results
    print_results(results)
244+
245+
246+
if __name__ == "__main__":
    # CLI entry point; `--profile` additionally captures profiler traces.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        "--profile",
        action="store_true",
        help="Enable profiling with PyTorch profiler",
    )
    cli_args = arg_parser.parse_args()
    main(cli_args)

setup.py

Lines changed: 32 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -787,38 +787,38 @@ def get_extensions():
787787

788788
# Only build the cutlass_90a extension if sm90a is in the architecture flags
789789
# and if torch version >= 2.10
790-
if (
791-
cutlass_90a_sources is not None
792-
and len(cutlass_90a_sources) > 0
793-
and build_for_sm90a
794-
and _torch_version_at_least("2.10.0")
795-
):
796-
cutlass_90a_extra_compile_args = copy.deepcopy(extra_compile_args)
797-
# Only use sm90a architecture for these sources, ignoring other flags
798-
cutlass_90a_extra_compile_args["nvcc"].extend(
799-
[
800-
"-DUSE_CUDA",
801-
"-gencode=arch=compute_90a,code=sm_90a",
802-
"-DTORCH_TARGET_VERSION=0x020a000000000000",
803-
]
804-
)
805-
# Add compile flags for stable ABI support (requires torch >= 2.10)
806-
cutlass_90a_extra_compile_args["cxx"].extend(
807-
[
808-
"-DUSE_CUDA",
809-
"-DTORCH_TARGET_VERSION=0x020a000000000000",
810-
]
811-
)
812-
# stable ABI cutlass_90a module
813-
ext_modules.append(
814-
extension(
815-
"torchao._C_cutlass_90a",
816-
cutlass_90a_sources,
817-
py_limited_api=True,
818-
extra_compile_args=cutlass_90a_extra_compile_args,
819-
extra_link_args=extra_link_args,
820-
)
821-
)
790+
# if (
791+
# cutlass_90a_sources is not None
792+
# and len(cutlass_90a_sources) > 0
793+
# and build_for_sm90a
794+
# and _torch_version_at_least("2.10.0")
795+
# ):
796+
# cutlass_90a_extra_compile_args = copy.deepcopy(extra_compile_args)
797+
# # Only use sm90a architecture for these sources, ignoring other flags
798+
# cutlass_90a_extra_compile_args["nvcc"].extend(
799+
# [
800+
# "-DUSE_CUDA",
801+
# "-gencode=arch=compute_90a,code=sm_90a",
802+
# "-DTORCH_TARGET_VERSION=0x020a000000000000",
803+
# ]
804+
# )
805+
# # Add compile flags for stable ABI support (requires torch >= 2.10)
806+
# cutlass_90a_extra_compile_args["cxx"].extend(
807+
# [
808+
# "-DUSE_CUDA",
809+
# "-DTORCH_TARGET_VERSION=0x020a000000000000",
810+
# ]
811+
# )
812+
# # stable ABI cutlass_90a module
813+
# ext_modules.append(
814+
# extension(
815+
# "torchao._C_cutlass_90a",
816+
# cutlass_90a_sources,
817+
# py_limited_api=True,
818+
# extra_compile_args=cutlass_90a_extra_compile_args,
819+
# extra_link_args=extra_link_args,
820+
# )
821+
# )
822822

823823
# Build CMakeLists from /torchao/csrc/cpu - additional options become available : TORCHAO_BUILD_CPU_AARCH64, TORCHAO_BUILD_KLEIDIAI, TORCHAO_BUILD_MPS_OPS, TORCHAO_PARALLEL_BACKEND
824824
if build_macos_arm_auto or os.getenv("BUILD_TORCHAO_EXPERIMENTAL") == "1":

test/prototype/moe_training/reference_moe.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,12 @@ def generate_permute_indices(
147147
torch.int32
148148
)
149149

150+
# Ensure m_sizes sums to exactly max_len (the actual data size after permutation)
151+
current_sum = m_sizes.sum().item()
152+
if current_sum != max_len:
153+
# Add the difference to the last expert
154+
m_sizes[-1] = m_sizes[-1] + (max_len - current_sum)
155+
150156
m_offsets = torch.cumsum(m_sizes, 0)
151157
write_offsets = m_offsets - m_sizes
152158

@@ -176,8 +182,8 @@ def generate_permute_indices(
176182
# Utils from torchtitan/models/moe/utils.py
177183
# =============================================================================
178184

179-
TOKEN_GROUP_ALIGN_SIZE_M = 8
180-
ValidTokenGroupAlignmentSize = Literal[8, 16, 32]
185+
TOKEN_GROUP_ALIGN_SIZE_M = 1
186+
ValidTokenGroupAlignmentSize = Literal[1, 16, 32]
181187

182188

183189
def set_token_group_alignment_size_m(

0 commit comments

Comments (0)