Add PDL (programmatic dependent launch) support to all norm kernels

yzh119 · yzh119 · commit fcd5c5d4d93e · 2026-02-03T08:26:35.000-05:00
diff --git a/flashinfer/norm/kernels/fused_add_rmsnorm.py b/flashinfer/norm/kernels/fused_add_rmsnorm.py
@@ -95,6 +95,7 @@ def __call__(
         mW: cute.Tensor,
         M: Int32,
         eps: Float32,
+        enable_pdl: cutlass.Constexpr[bool],
         stream,
     ):
         tv_shape, tv_stride = make_tv_layout(
@@ -105,11 +106,12 @@ def __call__(
         tv_layout = cute.make_layout(tv_shape, stride=tv_stride)
         tiler_mn = (1, self.cols_per_tile)
 
-        self.kernel(mX, mR, mW, M, eps, tv_layout, tiler_mn).launch(
+        self.kernel(mX, mR, mW, M, eps, enable_pdl, tv_layout, tiler_mn).launch(
             grid=[M, 1, 1],
             block=[self.num_threads, 1, 1],
             smem=self._smem_size_in_bytes(),
             stream=stream,
+            use_pdl=enable_pdl,
         )
 
     @cute.kernel
@@ -120,12 +122,17 @@ def kernel(
         mW: cute.Tensor,
         M: Int32,
         eps: Float32,
+        enable_pdl: cutlass.Constexpr[bool],
         tv_layout: cute.Layout,
         tiler_mn: cute.Shape,
     ):
         tidx, _, _ = cute.arch.thread_idx()
         bidx, _, _ = cute.arch.block_idx()
 
+        # PDL: Wait for previous kernel (SM90+ only)
+        if enable_pdl:
+            cute.arch.griddepcontrol_wait()
+
         H = self.H
         weight_bias = self.weight_bias
         threads_per_row = tv_layout.shape[0][0]
@@ -210,6 +217,10 @@ def kernel(
 
         cute.copy(copy_atom, tXrY, tYgX, pred=tXpX)
 
+        # PDL: Signal dependent kernels (SM90+ only)
+        if enable_pdl:
+            cute.arch.griddepcontrol_launch_dependents()
+
 
 # =============================================================================
 # FusedAddRMSNormQuantKernel
@@ -264,6 +275,7 @@ def __call__(
         M: Int32,
         scale: Float32,
         eps: Float32,
+        enable_pdl: cutlass.Constexpr[bool],
         stream,
     ):
         tv_shape, tv_stride = make_tv_layout(
@@ -274,11 +286,14 @@ def __call__(
         tv_layout = cute.make_layout(tv_shape, stride=tv_stride)
         tiler_mn = (1, self.cols_per_tile)
 
-        self.kernel(mY, mX, mR, mW, M, scale, eps, tv_layout, tiler_mn).launch(
+        self.kernel(
+            mY, mX, mR, mW, M, scale, eps, enable_pdl, tv_layout, tiler_mn
+        ).launch(
             grid=[M, 1, 1],
             block=[self.num_threads, 1, 1],
             smem=self._smem_size_in_bytes(),
             stream=stream,
+            use_pdl=enable_pdl,
         )
 
     @cute.kernel
@@ -291,12 +306,17 @@ def kernel(
         M: Int32,
         scale: Float32,
         eps: Float32,
+        enable_pdl: cutlass.Constexpr[bool],
         tv_layout: cute.Layout,
         tiler_mn: cute.Shape,
     ):
         tidx, _, _ = cute.arch.thread_idx()
         bidx, _, _ = cute.arch.block_idx()
 
+        # PDL: Wait for previous kernel (SM90+ only)
+        if enable_pdl:
+            cute.arch.griddepcontrol_wait()
+
         H = self.H
         weight_bias = self.weight_bias
         threads_per_row = tv_layout.shape[0][0]
@@ -396,14 +416,20 @@ def kernel(
                     out_ptr = get_ptr_as_int64(mY, Int32(out_offset))
                     cvt_and_store_f32_to_e4m3(clamped, out_ptr)
 
+        # PDL: Signal dependent kernels (SM90+ only)
+        if enable_pdl:
+            cute.arch.griddepcontrol_launch_dependents()
+
 
 # =============================================================================
 # Compiled Kernel Getters
 # =============================================================================
 
 
 @functools.cache
-def _get_compiled_fused_add_rmsnorm_kernel(dtype_str: str, H: int, weight_bias: float):
+def _get_compiled_fused_add_rmsnorm_kernel(
+    dtype_str: str, H: int, weight_bias: float, enable_pdl: bool
+):
     """Get a compiled Fused Add + RMSNorm kernel using TVM-FFI."""
     dtype = get_cutlass_dtype(dtype_str)
     kernel_obj = FusedAddRMSNormKernel(dtype, H, weight_bias)
@@ -429,6 +455,7 @@ def _get_compiled_fused_add_rmsnorm_kernel(dtype_str: str, H: int, weight_bias:
         w_fake,
         Int32(1),
         Float32(1e-6),
+        enable_pdl,
         stream_fake,
         options="--enable-tvm-ffi",
     )
@@ -453,7 +480,7 @@ def tensor_api(
 
 @functools.cache
 def _get_compiled_fused_add_rmsnorm_quant_kernel(
-    dtype_str: str, out_dtype_str: str, H: int, weight_bias: float
+    dtype_str: str, out_dtype_str: str, H: int, weight_bias: float, enable_pdl: bool
 ):
     """Get a compiled Fused Add + RMSNorm + Quant kernel using TVM-FFI."""
     dtype = get_cutlass_dtype(dtype_str)
@@ -487,6 +514,7 @@ def _get_compiled_fused_add_rmsnorm_quant_kernel(
         Int32(1),
         Float32(1.0),  # scale
         Float32(1e-6),
+        enable_pdl,
         stream_fake,
         options="--enable-tvm-ffi",
     )
@@ -536,7 +564,9 @@ def fused_add_rmsnorm_cute(
     M = input.shape[0]
 
     dtype_str = _torch_dtype_to_str(input.dtype)
-    kernel = _get_compiled_fused_add_rmsnorm_kernel(dtype_str, H, weight_bias)
+    kernel = _get_compiled_fused_add_rmsnorm_kernel(
+        dtype_str, H, weight_bias, enable_pdl
+    )
     kernel(input, residual, weight, M, eps)
 
 
@@ -562,7 +592,7 @@ def fused_add_rmsnorm_quant_cute(
     dtype_str = _torch_dtype_to_str(input.dtype)
     out_dtype_str = _torch_dtype_to_str(out.dtype)
     kernel = _get_compiled_fused_add_rmsnorm_quant_kernel(
-        dtype_str, out_dtype_str, H, weight_bias
+        dtype_str, out_dtype_str, H, weight_bias, enable_pdl
     )
     kernel(
         out,
diff --git a/flashinfer/norm/kernels/layernorm.py b/flashinfer/norm/kernels/layernorm.py
@@ -108,6 +108,7 @@ def __call__(
         mBeta: cute.Tensor,
         M: Int32,
         eps: Float32,
+        enable_pdl: cutlass.Constexpr[bool],
         stream,
     ):
         # Layout for input (float16/bfloat16)
@@ -135,6 +136,7 @@ def __call__(
             mBeta,
             M,
             eps,
+            enable_pdl,
             tv_layout,
             tiler_mn,
             tv_layout_f32,
@@ -144,6 +146,7 @@ def __call__(
             block=[self.num_threads, 1, 1],
             smem=self._smem_size_in_bytes(),
             stream=stream,
+            use_pdl=enable_pdl,
         )
 
     @cute.kernel
@@ -155,6 +158,7 @@ def kernel(
         mBeta: cute.Tensor,
         M: Int32,
         eps: Float32,
+        enable_pdl: cutlass.Constexpr[bool],
         tv_layout: cute.Layout,
         tiler_mn: cute.Shape,
         tv_layout_f32: cute.Layout,
@@ -163,6 +167,10 @@ def kernel(
         tidx, _, _ = cute.arch.thread_idx()
         bidx, _, _ = cute.arch.block_idx()
 
+        # PDL: Wait for previous kernel (SM90+ only)
+        if enable_pdl:
+            cute.arch.griddepcontrol_wait()
+
         H = self.H
         threads_per_row = tv_layout.shape[0][0]
         num_warps = self.num_warps
@@ -343,14 +351,20 @@ def kernel(
 
         cute.copy(copy_atom_load, tXrY, tXgY, pred=tXpX)
 
+        # PDL: Signal dependent kernels (SM90+ only)
+        if enable_pdl:
+            cute.arch.griddepcontrol_launch_dependents()
+
 
 # =============================================================================
 # Compiled Kernel Getter
 # =============================================================================
 
 
 @functools.cache
-def _get_compiled_layernorm_kernel(dtype_str: str, gamma_dtype_str: str, H: int):
+def _get_compiled_layernorm_kernel(
+    dtype_str: str, gamma_dtype_str: str, H: int, enable_pdl: bool
+):
     """Get a compiled LayerNorm kernel using TVM-FFI."""
     dtype = get_cutlass_dtype(dtype_str)
     gamma_dtype = get_cutlass_dtype(gamma_dtype_str)
@@ -383,6 +397,7 @@ def _get_compiled_layernorm_kernel(dtype_str: str, gamma_dtype_str: str, H: int)
         beta_fake,
         Int32(1),
         Float32(1e-6),
+        enable_pdl,
         stream_fake,
         options="--enable-tvm-ffi",
     )
@@ -418,6 +433,7 @@ def layernorm_cute(
     gamma: torch.Tensor,
     beta: torch.Tensor,
     eps: float = 1e-6,
+    enable_pdl: bool = False,
 ) -> None:
     """CuTe DSL LayerNorm implementation.
 
@@ -430,7 +446,7 @@ def layernorm_cute(
 
     dtype_str = _torch_dtype_to_str(input.dtype)
     gamma_dtype_str = _torch_dtype_to_str(gamma.dtype)
-    kernel = _get_compiled_layernorm_kernel(dtype_str, gamma_dtype_str, H)
+    kernel = _get_compiled_layernorm_kernel(dtype_str, gamma_dtype_str, H, enable_pdl)
     kernel(out, input, gamma, beta, M, eps)
 
 
diff --git a/flashinfer/norm/kernels/rmsnorm.py b/flashinfer/norm/kernels/rmsnorm.py