Skip to content

Commit 17103dd

Browse files
committed
Rename fused_qk_norm_rope to fused_qk_rmsnorm_rope
The kernel performs RMSNorm specifically (not LayerNorm or a generic norm), so it is renamed to fused_qk_rmsnorm_rope for consistency with FlashInfer's existing naming convention (rmsnorm, fused_add_rmsnorm, gemma_rmsnorm, fused_rmsnorm_silu). All files, imports, symbols, and docstrings are updated accordingly. Internal kernel function names (fusedQKNormRopeKernel, launchFusedQKNormRope) are kept as-is, since they are not part of the public API. Test results after the rename: 25 passed, 1 xfailed. AI-assisted. Made-with: Cursor
1 parent 7580c79 commit 17103dd

8 files changed

Lines changed: 42 additions & 42 deletions

File tree

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,13 @@
11
"""
2-
Benchmark for fused QKNorm + 3D RoPE kernel vs eager PyTorch baseline.
2+
Benchmark for fused QK RMSNorm + 3D RoPE kernel vs eager PyTorch baseline.
33
44
Measures performance across WAN model shapes and compares:
55
- Eager: separate nn.RMSNorm + manual interleaved RoPE in PyTorch
6-
- Fused: flashinfer.diffusion_ops.fused_qk_norm_rope (single kernel)
6+
- Fused: flashinfer.diffusion_ops.fused_qk_rmsnorm_rope (single kernel)
77
88
Usage:
9-
python benchmarks/bench_fused_qk_norm_rope.py
10-
python benchmarks/bench_fused_qk_norm_rope.py --gpu 2 # run on specific GPU
9+
python benchmarks/bench_fused_qk_rmsnorm_rope.py
10+
python benchmarks/bench_fused_qk_rmsnorm_rope.py --gpu 2 # run on specific GPU
1111
"""
1212

1313
import argparse
@@ -17,7 +17,7 @@
1717
import torch.nn as nn
1818

1919
from flashinfer.testing.utils import bench_gpu_time
20-
from flashinfer.diffusion_ops import fused_qk_norm_rope
20+
from flashinfer.diffusion_ops import fused_qk_rmsnorm_rope
2121

2222

2323
def compute_rope_dims(head_dim):
@@ -133,7 +133,7 @@ def eager_fn():
133133
return q_out, k_out, v_heads
134134

135135
def fused_fn():
136-
return fused_qk_norm_rope(
136+
return fused_qk_rmsnorm_rope(
137137
qkv_combined,
138138
q_weight,
139139
k_weight,
@@ -166,7 +166,7 @@ def fused_fn():
166166

167167

168168
def main():
169-
parser = argparse.ArgumentParser(description="Benchmark fused QKNorm + 3D RoPE")
169+
parser = argparse.ArgumentParser(description="Benchmark fused QK RMSNorm + 3D RoPE")
170170
parser.add_argument("--gpu", type=int, default=0, help="GPU device index")
171171
args = parser.parse_args()
172172

csrc/flashinfer_norm_binding.cu

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ void gemma_fused_add_rmsnorm(TensorView input, TensorView residual, TensorView w
3434

3535
void layernorm(Tensor out, Tensor input, Tensor gamma, Tensor beta, double eps);
3636

37-
void fused_qk_norm_rope_run(TensorView qkv_in, TensorView q_weight, TensorView k_weight,
37+
void fused_qk_rmsnorm_rope_run(TensorView qkv_in, TensorView q_weight, TensorView k_weight,
3838
TensorView q_out, TensorView k_out, TensorView v_out,
3939
int64_t num_tokens, int64_t seq_len, int64_t ppf, int64_t pph,
4040
int64_t ppw, int64_t num_frame_channels, int64_t num_height_channels,
@@ -51,4 +51,4 @@ TVM_FFI_DLL_EXPORT_TYPED_FUNC(fused_add_rmsnorm_quant, fused_add_rmsnorm_quant);
5151
TVM_FFI_DLL_EXPORT_TYPED_FUNC(gemma_rmsnorm, gemma_rmsnorm);
5252
TVM_FFI_DLL_EXPORT_TYPED_FUNC(gemma_fused_add_rmsnorm, gemma_fused_add_rmsnorm);
5353
TVM_FFI_DLL_EXPORT_TYPED_FUNC(layernorm, layernorm);
54-
TVM_FFI_DLL_EXPORT_TYPED_FUNC(fused_qk_norm_rope, fused_qk_norm_rope_run);
54+
TVM_FFI_DLL_EXPORT_TYPED_FUNC(fused_qk_rmsnorm_rope, fused_qk_rmsnorm_rope_run);

csrc/norm.cu

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
* See the License for the specific language governing permissions and
1414
* limitations under the License.
1515
*/
16-
#include <flashinfer/fused_qk_norm_rope.cuh>
16+
#include <flashinfer/fused_qk_rmsnorm_rope.cuh>
1717
#include <flashinfer/norm.cuh>
1818

1919
#include "tvm_ffi_utils.h"
@@ -273,7 +273,7 @@ void layernorm(Tensor output, Tensor input, Tensor gamma, Tensor beta, double ep
273273
});
274274
}
275275

276-
void fused_qk_norm_rope_run(TensorView qkv_in, TensorView q_weight, TensorView k_weight,
276+
void fused_qk_rmsnorm_rope_run(TensorView qkv_in, TensorView q_weight, TensorView k_weight,
277277
TensorView q_out, TensorView k_out, TensorView v_out,
278278
int64_t num_tokens, int64_t seq_len, int64_t ppf, int64_t pph,
279279
int64_t ppw, int64_t num_frame_channels, int64_t num_height_channels,
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
from flashinfer.norm import fused_qk_norm_rope
1+
from flashinfer.norm import fused_qk_rmsnorm_rope
22

33
__all__ = [
4-
"fused_qk_norm_rope",
4+
"fused_qk_rmsnorm_rope",
55
]

flashinfer/norm/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -761,7 +761,7 @@ def fused_rmsnorm_silu(
761761
return out
762762

763763

764-
from .fused_qk_norm_rope import fused_qk_norm_rope as fused_qk_norm_rope
764+
from .fused_qk_rmsnorm_rope import fused_qk_rmsnorm_rope as fused_qk_rmsnorm_rope
765765

766766
# Public API exports
767767
__all__ = [
@@ -776,5 +776,5 @@ def fused_rmsnorm_silu(
776776
"gemma_fused_add_rmsnorm",
777777
"layernorm",
778778
"fused_rmsnorm_silu",
779-
"fused_qk_norm_rope",
779+
"fused_qk_rmsnorm_rope",
780780
]
Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
See the License for the specific language governing permissions and
1414
limitations under the License.
1515
16-
Fused QKNorm + 3D RoPE for Video Generation DIT Self-Attention
16+
Fused QK RMSNorm + 3D RoPE for Video Generation DIT Self-Attention
1717
===============================================================
1818
1919
Fuses across-heads RMSNorm on Q and K, 3D rotary position embeddings
@@ -34,13 +34,13 @@
3434

3535

3636
@supported_compute_capability([80, 86, 89, 90, 100, 103, 110, 120, 121])
37-
def _check_fused_qk_norm_rope(
37+
def _check_fused_qk_rmsnorm_rope(
3838
qkv,
3939
q_weight,
4040
k_weight,
4141
**kwargs,
4242
):
43-
"""Validate inputs for fused QKNorm + 3D RoPE.
43+
"""Validate inputs for fused QK RMSNorm + 3D RoPE.
4444
4545
Architecture notes:
4646
- SM80+ (Ampere): Full support for BF16 path; FP8 output uses software emulation
@@ -119,8 +119,8 @@ def _check_fused_qk_norm_rope(
119119

120120

121121
@flashinfer_api
122-
@backend_requirement(backend_checks={}, common_check=_check_fused_qk_norm_rope)
123-
def fused_qk_norm_rope(
122+
@backend_requirement(backend_checks={}, common_check=_check_fused_qk_rmsnorm_rope)
123+
def fused_qk_rmsnorm_rope(
124124
qkv: torch.Tensor,
125125
q_weight: torch.Tensor,
126126
k_weight: torch.Tensor,
@@ -150,7 +150,7 @@ def fused_qk_norm_rope(
150150
k_out: Optional[torch.Tensor] = None,
151151
v_out: Optional[torch.Tensor] = None,
152152
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
153-
r"""Fused QKNorm + 3D RoPE + V copy for video generation DIT self-attention.
153+
r"""Fused QK RMSNorm + 3D RoPE + V copy for video generation DIT self-attention.
154154
155155
Applies across-heads RMSNorm to Q and K, then rotary position embeddings
156156
with 3D spatial decomposition (frame/height/width), and copies V to a
@@ -255,7 +255,7 @@ def fused_qk_norm_rope(
255255
k_out_flat = k_out.view(num_tokens, -1)
256256
v_out_flat = v_out.view(num_tokens, -1)
257257

258-
get_norm_module().fused_qk_norm_rope(
258+
get_norm_module().fused_qk_rmsnorm_rope(
259259
qkv_flat,
260260
q_weight,
261261
k_weight,

include/flashinfer/fused_qk_norm_rope.cuh renamed to include/flashinfer/fused_qk_rmsnorm_rope.cuh

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,8 @@
1313
* See the License for the specific language governing permissions and
1414
* limitations under the License.
1515
*/
16-
#ifndef FLASHINFER_FUSED_QK_NORM_ROPE_CUH_
17-
#define FLASHINFER_FUSED_QK_NORM_ROPE_CUH_
16+
#ifndef FLASHINFER_FUSED_QK_RMSNORM_ROPE_CUH_
17+
#define FLASHINFER_FUSED_QK_RMSNORM_ROPE_CUH_
1818

1919
#include <cuda_bf16.h>
2020
#include <cuda_fp8.h>
@@ -303,7 +303,7 @@ __device__ __forceinline__ void quantize_store_fp8(float2 const* elements, __nv_
303303
}
304304

305305
////////////////////////////////////////////////////////////////////////////////////////////////////
306-
// Section 6: Fused QKNorm + RoPE kernel
306+
// Section 6: Fused QK RMSNorm + RoPE kernel
307307
//
308308
// Performs across-heads RMSNorm and 3D RoPE in a single kernel (for self-attention).
309309
// Also copies V to a separate contiguous output buffer with optional FP8 quantization.
@@ -751,4 +751,4 @@ inline void launchFusedQKNormRope(void const* qkv_in, void* q_out, void* k_out,
751751

752752
} // namespace flashinfer
753753

754-
#endif // FLASHINFER_FUSED_QK_NORM_ROPE_CUH_
754+
#endif // FLASHINFER_FUSED_QK_RMSNORM_ROPE_CUH_
Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
"""
2-
Tests for fused QKNorm + 3D RoPE kernel.
2+
Tests for fused QK RMSNorm + 3D RoPE kernel.
33
44
Tests correctness against a PyTorch reference implementation that matches
55
the WAN 2.2 model.py:
@@ -15,7 +15,7 @@
1515
import torch
1616
import torch.nn as nn
1717

18-
from flashinfer.diffusion_ops import fused_qk_norm_rope
18+
from flashinfer.diffusion_ops import fused_qk_rmsnorm_rope
1919

2020

2121
# ---------------------------------------------------------------------------
@@ -322,7 +322,7 @@ def test_interleaved_correctness(batch_size, ppf, pph, ppw):
322322
)
323323

324324
qkv_combined = torch.cat([query, key, value], dim=-1).contiguous()
325-
q_fused, k_fused, v_fused = fused_qk_norm_rope(
325+
q_fused, k_fused, v_fused = fused_qk_rmsnorm_rope(
326326
qkv_combined,
327327
norm_q.weight.contiguous(),
328328
norm_k.weight.contiguous(),
@@ -401,7 +401,7 @@ def test_neox_correctness(batch_size, ppf, pph, ppw):
401401
)
402402

403403
qkv_combined = torch.cat([query, key, value], dim=-1).contiguous()
404-
q_fused, k_fused, v_fused = fused_qk_norm_rope(
404+
q_fused, k_fused, v_fused = fused_qk_rmsnorm_rope(
405405
qkv_combined,
406406
norm_q.weight.contiguous(),
407407
norm_k.weight.contiguous(),
@@ -458,7 +458,7 @@ def test_v_passthrough():
458458
q_weight = torch.ones(hidden_dim, device=device, dtype=dtype)
459459
k_weight = torch.ones(hidden_dim, device=device, dtype=dtype)
460460

461-
_, _, v_fused = fused_qk_norm_rope(
461+
_, _, v_fused = fused_qk_rmsnorm_rope(
462462
qkv_combined,
463463
q_weight,
464464
k_weight,
@@ -509,7 +509,7 @@ def test_destination_passing():
509509
batch_size, seq_len, num_heads, head_dim, device=device, dtype=dtype
510510
)
511511

512-
q_ret, k_ret, v_ret = fused_qk_norm_rope(
512+
q_ret, k_ret, v_ret = fused_qk_rmsnorm_rope(
513513
qkv,
514514
torch.ones(hidden_dim, device=device, dtype=dtype),
515515
torch.ones(hidden_dim, device=device, dtype=dtype),
@@ -581,8 +581,8 @@ def test_2d_input():
581581
q_weight = torch.ones(hidden_dim, device=device, dtype=dtype)
582582
k_weight = torch.ones(hidden_dim, device=device, dtype=dtype)
583583

584-
q_3d, k_3d, v_3d = fused_qk_norm_rope(qkv_3d, q_weight, k_weight, **kwargs)
585-
q_2d, k_2d, v_2d = fused_qk_norm_rope(qkv_2d, q_weight, k_weight, **kwargs)
584+
q_3d, k_3d, v_3d = fused_qk_rmsnorm_rope(qkv_3d, q_weight, k_weight, **kwargs)
585+
q_2d, k_2d, v_2d = fused_qk_rmsnorm_rope(qkv_2d, q_weight, k_weight, **kwargs)
586586

587587
assert q_3d.ndim == 4, f"3D input should give 4D output, got {q_3d.ndim}D"
588588
assert q_2d.ndim == 3, f"2D input should give 3D output, got {q_2d.ndim}D"
@@ -626,7 +626,7 @@ def test_fp8_output(output_scale):
626626

627627
qkv_combined = torch.cat([query, key, value], dim=-1).contiguous()
628628

629-
q_fp8, k_fp8, v_fp8 = fused_qk_norm_rope(
629+
q_fp8, k_fp8, v_fp8 = fused_qk_rmsnorm_rope(
630630
qkv_combined,
631631
norm_q.weight.contiguous(),
632632
norm_k.weight.contiguous(),
@@ -713,7 +713,7 @@ def test_rope_only_no_norm():
713713
q_weight = torch.ones(hidden_dim, device=device, dtype=dtype)
714714
k_weight = torch.ones(hidden_dim, device=device, dtype=dtype)
715715

716-
q_fused, k_fused, _ = fused_qk_norm_rope(
716+
q_fused, k_fused, _ = fused_qk_rmsnorm_rope(
717717
qkv_combined,
718718
q_weight,
719719
k_weight,
@@ -811,7 +811,7 @@ def test_multi_config(config_name):
811811
)
812812

813813
qkv_combined = torch.cat([query, key, value], dim=-1).contiguous()
814-
q_fused, k_fused, _ = fused_qk_norm_rope(
814+
q_fused, k_fused, _ = fused_qk_rmsnorm_rope(
815815
qkv_combined,
816816
norm_q.weight.contiguous(),
817817
norm_k.weight.contiguous(),
@@ -847,7 +847,7 @@ def test_error_non_cuda():
847847
qkv = torch.randn(1, 120, 3 * 3072, dtype=torch.bfloat16)
848848
w = torch.ones(3072, dtype=torch.bfloat16)
849849
with pytest.raises((ValueError, RuntimeError)):
850-
fused_qk_norm_rope(
850+
fused_qk_rmsnorm_rope(
851851
qkv,
852852
w,
853853
w,
@@ -869,7 +869,7 @@ def test_error_wrong_dtype():
869869
qkv = torch.randn(1, 120, 3 * 3072, dtype=torch.float16, device=device)
870870
w = torch.ones(3072, dtype=torch.bfloat16, device=device)
871871
with pytest.raises((ValueError, RuntimeError)):
872-
fused_qk_norm_rope(
872+
fused_qk_rmsnorm_rope(
873873
qkv,
874874
w,
875875
w,
@@ -893,7 +893,7 @@ def test_error_bad_head_dim():
893893
qkv = torch.randn(1, 120, 3 * hidden, dtype=torch.bfloat16, device=device)
894894
w = torch.ones(hidden, dtype=torch.bfloat16, device=device)
895895
with pytest.raises((ValueError, RuntimeError)):
896-
fused_qk_norm_rope(
896+
fused_qk_rmsnorm_rope(
897897
qkv,
898898
w,
899899
w,
@@ -915,7 +915,7 @@ def test_error_channel_sum_mismatch():
915915
qkv = torch.randn(1, 120, 3 * 3072, dtype=torch.bfloat16, device=device)
916916
w = torch.ones(3072, dtype=torch.bfloat16, device=device)
917917
with pytest.raises((ValueError, RuntimeError)):
918-
fused_qk_norm_rope(
918+
fused_qk_rmsnorm_rope(
919919
qkv,
920920
w,
921921
w,
@@ -937,7 +937,7 @@ def test_error_seq_len_mismatch():
937937
qkv = torch.randn(1, 100, 3 * 3072, dtype=torch.bfloat16, device=device)
938938
w = torch.ones(3072, dtype=torch.bfloat16, device=device)
939939
with pytest.raises((ValueError, RuntimeError)):
940-
fused_qk_norm_rope(
940+
fused_qk_rmsnorm_rope(
941941
qkv,
942942
w,
943943
w,

0 commit comments

Comments (0)