Skip to content

Commit f58c16f

Browse files
committed
Merge branch 'main' into fix_da8w4
2 parents 2a9ff0e + 3d02561 commit f58c16f

File tree

27 files changed

+1372
-428
lines changed

27 files changed

+1372
-428
lines changed

benchmarks/float8/float8_inference_roofline.py

Lines changed: 21 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,7 @@ def get_gemm_times(
112112

113113
bf16_time_s = get_gpu_kernel_gemm_time_s(torch.mm, x_bf16, w_bf16)
114114

115-
if recipe_name in ("mxfp4_cutlass", "nvfp4"):
115+
if recipe_name in ("mxfp4_cutlass", "nvfp4", "nvfp4_static"):
116116
d1, d2, d3 = torch.float4_e2m1fn_x2, torch.float4_e2m1fn_x2, torch.bfloat16
117117
A = torch.randint(0, 255, (M, K // 2), device=device, dtype=torch.uint8).view(
118118
d1
@@ -151,7 +151,7 @@ def get_gemm_times(
151151
scale_b = torch.ones(N, K // 32, device=device, dtype=torch.float8_e8m0fnu)
152152
scale_a = to_blocked(scale_a)
153153
scale_b = to_blocked(scale_b)
154-
elif recipe_name == "nvfp4":
154+
elif recipe_name in ("nvfp4", "nvfp4_static"):
155155
scale_a = torch.ones(M, K // 16, device=device, dtype=torch.float8_e4m3fn)
156156
scale_b = torch.ones(N, K // 16, device=device, dtype=torch.float8_e4m3fn)
157157
scale_a = to_blocked(scale_a)
@@ -177,7 +177,7 @@ def do_matmul(A, B):
177177
swizzle_b=SwizzleType.SWIZZLE_32_4_4,
178178
output_dtype=d3,
179179
)
180-
if recipe_name == "nvfp4":
180+
if recipe_name in ("nvfp4", "nvfp4_static"):
181181
return torch._scaled_mm(
182182
A, B, scale_a, scale_b, out_dtype=d3, use_fast_accum=False
183183
)
@@ -795,12 +795,29 @@ def run(
795795
)
796796
elif recipe_name == "nvfp4":
797797
config = NVFP4DynamicActivationNVFP4WeightConfig(
798-
use_dynamic_per_tensor_scale=False,
798+
use_dynamic_per_tensor_scale=True,
799+
)
800+
elif recipe_name == "nvfp4_static":
801+
config_calib = NVFP4DynamicActivationNVFP4WeightConfig(
802+
step="prepare",
803+
)
804+
config = NVFP4DynamicActivationNVFP4WeightConfig(
805+
step="convert",
799806
)
800807
else:
801808
assert False, "unsupported"
802809

803810
m_fp8_dyn = copy.deepcopy(m_orig)
811+
812+
if recipe_name == "nvfp4_static":
813+
# calibrate with sample data
814+
# this benchmark is performance-only, so a toy datum is fine
815+
quantize_(m_fp8_dyn, config_calib)
816+
toy_datum = torch.randn(
817+
M_val, K_val, dtype=torch.bfloat16, device="cuda"
818+
)
819+
m_fp8_dyn(toy_datum)
820+
804821
if op_name == "linear":
805822
quantize_(m_fp8_dyn, config)
806823
elif op_name == "conv2d":

benchmarks/mx_formats/cast_bench.py

Lines changed: 33 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,7 @@ def run(
118118
"dim1_mxfp8_floor",
119119
"dim1_mxfp8_rceil",
120120
"dim1_mxfp8_triton_floor",
121+
"dim1_mxfp8_triton_rceil",
121122
"dim1_mxfp8_cuda_floor",
122123
"dim1_mxfp8_cuda_rceil",
123124
)
@@ -350,12 +351,41 @@ def run(
350351
bps = (bytes_r + bytes_w) / (time_us / 1e6)
351352

352353
elif mode == "dim1_mxfp8_triton_floor":
353-
y_d1, s_d1 = triton_to_mxfp8_dim1(x, inner_block_size=BLOCK_SIZE)
354+
y_d1, s_d1 = triton_to_mxfp8_dim1(
355+
x, inner_block_size=BLOCK_SIZE, scaling_mode="floor"
356+
)
354357

355358
for _ in range(2):
356-
__ = triton_to_mxfp8_dim1(x, inner_block_size=BLOCK_SIZE)
359+
__ = triton_to_mxfp8_dim1(
360+
x, inner_block_size=BLOCK_SIZE, scaling_mode="floor"
361+
)
357362
time_us = benchmark_cuda_function_in_microseconds(
358-
lambda x, b: triton_to_mxfp8_dim1(x, inner_block_size=BLOCK_SIZE),
363+
lambda x, b: triton_to_mxfp8_dim1(
364+
x, inner_block_size=BLOCK_SIZE, scaling_mode="floor"
365+
),
366+
x,
367+
BLOCK_SIZE,
368+
)
369+
370+
assert y_d1.dtype == torch.float8_e4m3fn
371+
assert s_d1.dtype == torch.float8_e8m0fnu
372+
bytes_r = x.numel() * bytes_per_el_bf16
373+
bytes_w = (y_d1.numel() + s_d1.numel()) * bytes_per_el_fp8
374+
bps = (bytes_r + bytes_w) / (time_us / 1e6)
375+
376+
elif mode == "dim1_mxfp8_triton_rceil":
377+
y_d1, s_d1 = triton_to_mxfp8_dim1(
378+
x, inner_block_size=BLOCK_SIZE, scaling_mode="rceil"
379+
)
380+
381+
for _ in range(2):
382+
__ = triton_to_mxfp8_dim1(
383+
x, inner_block_size=BLOCK_SIZE, scaling_mode="rceil"
384+
)
385+
time_us = benchmark_cuda_function_in_microseconds(
386+
lambda x, b: triton_to_mxfp8_dim1(
387+
x, inner_block_size=BLOCK_SIZE, scaling_mode="rceil"
388+
),
359389
x,
360390
BLOCK_SIZE,
361391
)

benchmarks/prototype/moe_training/mxfp8/bench_pad_token_groups.py

Lines changed: 10 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -7,15 +7,14 @@
77

88
import argparse
99
import itertools
10-
import time
1110
from dataclasses import dataclass
1211
from typing import List
1312

1413
import torch
1514
from tabulate import tabulate
1615
from tqdm import tqdm
1716

18-
from benchmarks.utils import profile_fn
17+
from benchmarks.utils import benchmark_cuda_function_in_microseconds, profile_fn
1918
from torchao.prototype.moe_training.kernels.mxfp8 import (
2019
_mxfp8_cuda_kernels_available,
2120
fused_pad_token_groups_cuda,
@@ -73,19 +72,6 @@ def get_configs() -> List[ExperimentConfig]:
7372
return configs
7473

7574

76-
def benchmark_host_side_in_microseconds(fn, *args, num_iters=100, **kwargs):
77-
"""
78-
Benchmark using host-side timing, includes buffer allocation overhead.
79-
"""
80-
torch.cuda.synchronize()
81-
start = time.perf_counter()
82-
for _ in range(num_iters):
83-
fn(*args, **kwargs)
84-
torch.cuda.synchronize()
85-
end = time.perf_counter()
86-
return ((end - start) / num_iters) * 1e6 # Convert to microseconds
87-
88-
8975
def run_experiment(
9076
config: ExperimentConfig, args: argparse.Namespace
9177
) -> ExperimentResult:
@@ -102,15 +88,19 @@ def torch_eager_with_offsets():
10288
group_offsets = generate_jagged_offs(
10389
num_groups, num_tokens, multiple_of=1, device=device
10490
)
105-
return torch_pad_token_groups(inputs, group_offsets, alignment_size)
91+
return torch_pad_token_groups(
92+
inputs, group_offsets, alignment_size
93+
) # Returns 3 values
10694

10795
def warmup(fn):
10896
for _ in range(5):
10997
fn()
11098

11199
# bench torch eager (includes buffer allocation overhead)
112100
warmup(torch_eager_with_offsets)
113-
torch_eager_time_us = benchmark_host_side_in_microseconds(torch_eager_with_offsets)
101+
torch_eager_time_us = benchmark_cuda_function_in_microseconds(
102+
torch_eager_with_offsets
103+
)
114104
if args.profile:
115105
group_offsets = generate_jagged_offs(
116106
num_groups, num_tokens, multiple_of=1, device=device
@@ -133,7 +123,7 @@ def cuda_with_offsets():
133123
return fused_pad_token_groups_cuda(inputs, group_offsets, alignment_size)
134124

135125
warmup(cuda_with_offsets)
136-
cuda_time_us = benchmark_host_side_in_microseconds(cuda_with_offsets)
126+
cuda_time_us = benchmark_cuda_function_in_microseconds(cuda_with_offsets)
137127
if args.profile:
138128
group_offsets = generate_jagged_offs(
139129
num_groups, num_tokens, multiple_of=1, device=device
@@ -152,8 +142,8 @@ def cuda_with_offsets():
152142
group_offsets = generate_jagged_offs(
153143
num_groups, num_tokens, multiple_of=1, device=device
154144
)
155-
torch_padded_tokens, torch_padded_offsets = torch_pad_token_groups(
156-
inputs, group_offsets, alignment_size
145+
torch_padded_tokens, torch_padded_start_offsets, torch_padded_offsets = (
146+
torch_pad_token_groups(inputs, group_offsets, alignment_size)
157147
)
158148

159149
bytes_per_el = torch.finfo(torch.bfloat16).bits / 8

0 commit comments

Comments (0)