Triton-based Gather/Scatter kernels run only on valid tokens.

levendlee · facebook-github-bot · commit 492e5dfa7d7a · 2025-05-23T14:16:01.000-07:00
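Summary: Triton-based Gather/Scatter kernels run only on valid tokens. An optional `valid_token_count` tensor is passed to each kernel, and any program whose token index is greater than or equal to that count returns early.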

Differential Revision: D75320859
diff --git a/fbgemm_gpu/experimental/gen_ai/gen_ai/moe/gather_scatter.py b/fbgemm_gpu/experimental/gen_ai/gen_ai/moe/gather_scatter.py
@@ -21,6 +21,7 @@ def gather_scale_dense_tokens(
     token_indices: torch.Tensor,
     expert_indices: torch.Tensor,
     scores: torch.Tensor,
+    valid_token_count: Optional[torch.Tensor] = None,
 ) -> torch.Tensor:
     T, D = x.shape
     E = scores.shape[1]
@@ -58,6 +59,7 @@ def gather_scale_dense_tokens(
         scores,
         stride_t,
         stride_e,
+        valid_token_count,
         D,  # pyre-ignore
         BLOCK_D_OUTER,  # pyre-ignore
         BLOCK_D_INNER,  # pyre-ignore
@@ -71,6 +73,7 @@ def gather_scale_quant_dense_tokens(
     expert_indices: torch.Tensor,
     scores: torch.Tensor,
     scale_ub: Optional[torch.Tensor] = None,
+    valid_token_count: Optional[torch.Tensor] = None,
 ) -> Tuple[torch.Tensor, torch.Tensor]:
     T, D = x.shape
     E = scores.shape[1]
@@ -104,6 +107,7 @@ def gather_scale_quant_dense_tokens(
         scale_ub,
         stride_t,
         stride_e,
+        valid_token_count,
         D,
         TL_FP8_DTYPE=tl_dtype,
         MAX_FP8=max_fp8,
@@ -117,6 +121,7 @@ def scatter_add_dense_tokens(
     out_tokens: torch.Tensor,  # [T, D]
     in_tokens: torch.Tensor,  # [a, D]
     token_indices: torch.Tensor,  # [a]
+    valid_token_count: Optional[torch.Tensor] = None,
 ) -> None:
     assert torch.version.hip is not None or (
         torch.version.cuda is not None and torch.version.cuda >= "12.4"
@@ -144,6 +149,7 @@ def scatter_add_dense_tokens(
         out_tokens,
         in_tokens,
         token_indices,
+        valid_token_count,
         D,  # pyre-ignore
         BLOCK_D_OUTER,  # pyre-ignore
         BLOCK_D_INNER,  # pyre-ignore
@@ -206,6 +212,7 @@ def gather_scale_dense_tokens_meta(
     token_indices,
     expert_indices,
     scores,
+    valid_token_count=None,
 ):
     D = x.shape[1]
     a = token_indices.shape[0]
@@ -218,12 +225,14 @@ def gather_scale_dense_tokens_cuda(
     token_indices,
     expert_indices,
     scores,
+    valid_token_count=None,
 ):
     return gather_scale_dense_tokens(
         x,
         token_indices,
         expert_indices,
         scores,
+        valid_token_count,
     )
 
 
@@ -241,7 +250,8 @@ def gather_scale_quant_dense_tokens_meta(
     token_indices,
     expert_indices,
     scores,
-    scale_ub,
+    scale_ub=None,
+    valid_token_count=None,
 ):
     D = x.shape[1]
     a = token_indices.shape[0]
@@ -258,13 +268,15 @@ def gather_scale_quant_dense_tokens_cuda(
     expert_indices,
     scores,
     scale_ub=None,
+    valid_token_count=None,
 ):
     return gather_scale_quant_dense_tokens(
         x,
         token_indices,
         expert_indices,
         scores,
         scale_ub,
+        valid_token_count,
     )
 
 
@@ -281,6 +293,7 @@ def scatter_add_dense_tokens_meta(
     out_tokens,
     in_tokens,
     token_indices,
+    valid_token_count=None,
 ):
     return None
 
@@ -290,8 +303,11 @@ def scatter_add_dense_tokens_cuda(
     out_tokens,
     in_tokens,
     token_indices,
+    valid_token_count=None,
 ):
-    return scatter_add_dense_tokens(out_tokens, in_tokens, token_indices)
+    return scatter_add_dense_tokens(
+        out_tokens, in_tokens, token_indices, valid_token_count
+    )
 
 
 _SCATTER_ADD_PADDED_TOKENS_OP_NAME = "fbgemm::scatter_add_padded_tokens"
@@ -337,13 +353,21 @@ def _fbgemm_gather_scale_dense_tokens(
     scores,
     stride_t,
     stride_e,
+    valid_token_count,
     D: tl.constexpr,
     BLOCK_D_OUTER: tl.constexpr,
     BLOCK_D_INNER: tl.constexpr,
 ):
     output_token_index = tl.program_id(0)
     feature_offset = tl.program_id(1) * BLOCK_D_OUTER
 
+    if valid_token_count is not None:
+        valid_token_count = tl.load(
+            valid_token_count, None, eviction_policy="evict_last"
+        )
+        if output_token_index >= valid_token_count:
+            return
+
     input_token_index = tl.load(
         token_indices + output_token_index, None, eviction_policy="evict_last"
     )
@@ -383,13 +407,21 @@ def _fbgemm_scatter_add_dense_tokens(
     out_tokens,
     in_tokens,
     token_indices,
+    valid_token_count,
     D: tl.constexpr,
     BLOCK_D_OUTER: tl.constexpr,
     BLOCK_D_INNER: tl.constexpr,
 ):
     input_token_index = tl.program_id(0).to(tl.int64)
     feature_offset = tl.program_id(1) * BLOCK_D_OUTER + tl.arange(0, BLOCK_D_INNER)[:]
 
+    if valid_token_count is not None:
+        valid_token_count = tl.load(
+            valid_token_count, None, eviction_policy="evict_last"
+        )
+        if input_token_index >= valid_token_count:
+            return
+
     output_token_index = tl.load(
         token_indices + input_token_index, None, eviction_policy="evict_last"
     ).to(tl.int64)
@@ -429,6 +461,7 @@ def _fbgemm_gather_scale_fp8_rowwise_quant_dense_tokens(
     scale_ub_ptr,
     stride_t,
     stride_e,
+    valid_token_count,
     D: tl.constexpr,
     TL_FP8_DTYPE: tl.constexpr,
     MAX_FP8: tl.constexpr,
@@ -440,6 +473,13 @@ def _fbgemm_gather_scale_fp8_rowwise_quant_dense_tokens(
 
     output_token_index = tl.program_id(0)
 
+    if valid_token_count is not None:
+        valid_token_count = tl.load(
+            valid_token_count, None, eviction_policy="evict_last"
+        )
+        if output_token_index >= valid_token_count:
+            return
+
     input_token_index = tl.load(
         token_indices_ptr + output_token_index, None, eviction_policy="evict_first"
     )
diff --git a/fbgemm_gpu/experimental/gen_ai/test/moe/gather_scatter_test.py b/fbgemm_gpu/experimental/gen_ai/test/moe/gather_scatter_test.py
@@ -9,7 +9,7 @@
 
 import logging
 import unittest
-from typing import Tuple
+from typing import Optional, Tuple
 
 import torch
 import triton  # noqa: F401
@@ -28,11 +28,9 @@
 from hypothesis import given, settings, strategies as st, Verbosity
 
 try:
-    # pyre-ignore[21]
     # @manual=//deeplearning/fbgemm/fbgemm_gpu:test_utils
     from fbgemm_gpu import open_source
 
-    # pyre-ignore[21]
     # @manual=//deeplearning/fbgemm/fbgemm_gpu:test_utils
     from fbgemm_gpu.docs.version import __version__  # noqa: F401
 except Exception:
@@ -58,6 +56,7 @@ class GatherScatterTests(unittest.TestCase):
         E=st.sampled_from([2, 4, 8]),
         T=st.sampled_from([1, 128, 2048, 4096, 16384]),
         D=st.sampled_from([5120, 7168]),
+        partial=st.sampled_from([True, False]),
         rowmajor=st.sampled_from([True, False]),
         compiled=st.sampled_from([True, False]),
     )
@@ -67,6 +66,7 @@ def test_gather_scale_dense_tokens(
         E: int,
         T: int,
         D: int,
+        partial: bool,
         rowmajor: bool,
         compiled: bool,
     ) -> None:
@@ -78,6 +78,22 @@ def test_gather_scale_dense_tokens(
         token_indices: torch.Tensor = torch.randperm(T, device="cuda").to(torch.int32)
         scores: torch.Tensor = torch.rand((E, T), dtype=torch.bfloat16, device="cuda")
 
+        num_valid_tokens: int = T
+        valid_token_count: Optional[torch.Tensor] = None
+        partial_expert_indices: torch.Tensor = expert_indices
+        partial_token_indices: torch.Tensor = token_indices
+        if partial:
+            num_valid_tokens = T // 2
+            valid_token_count = torch.tensor(
+                [num_valid_tokens], dtype=torch.int32, device="cuda"
+            )
+            partial_expert_indices = torch.where(
+                torch.arange(T).cuda() < num_valid_tokens, expert_indices, -1
+            )
+            partial_token_indices = torch.where(
+                torch.arange(T).cuda() < num_valid_tokens, token_indices, -1
+            )
+
         def torch_fn() -> torch.Tensor:
             shuffled_x = torch.index_select(x, dim=0, index=token_indices)
             shuffled_scores = torch.index_select(scores, dim=1, index=token_indices)
@@ -96,17 +112,26 @@ def triton_fn() -> torch.Tensor:
             op = gather_scale_dense_tokens
             if compiled:
                 op = torch.compile(op)
-            test_output = op(x, token_indices, expert_indices, scores_)
+            test_output = op(
+                x,
+                partial_token_indices,
+                partial_expert_indices,
+                scores_,
+                valid_token_count,
+            )
             return test_output
 
         test_output = triton_fn()
 
-        torch.testing.assert_close(torch_output, test_output)
+        torch.testing.assert_close(
+            torch_output[:num_valid_tokens], test_output[:num_valid_tokens]
+        )
 
     @given(
         E=st.sampled_from([2, 4, 8]),
         T=st.sampled_from([1, 128, 2048, 4096, 16384]),
         D=st.sampled_from([5120, 7168]),
+        partial=st.sampled_from([True, False]),
         rowmajor=st.sampled_from([True, False]),
         compiled=st.sampled_from([True, False]),
     )
@@ -116,6 +141,7 @@ def test_gather_scale_quant_dense_tokens(
         E: int,
         T: int,
         D: int,
+        partial: bool,
         rowmajor: bool,
         compiled: bool,
     ) -> None:
@@ -126,9 +152,24 @@ def test_gather_scale_quant_dense_tokens(
         expert_indices: torch.Tensor = torch.randint(0, E, (T,), device="cuda")
         token_indices: torch.Tensor = torch.randperm(T, device="cuda").to(torch.int32)
         scores: torch.Tensor = torch.randn((E, T), dtype=torch.bfloat16, device="cuda")
-
         scale_ub = torch.tensor([1200], dtype=torch.float, device="cuda")
 
+        num_valid_tokens: int = T
+        valid_token_count: Optional[torch.Tensor] = None
+        partial_expert_indices: torch.Tensor = expert_indices
+        partial_token_indices: torch.Tensor = token_indices
+        if partial:
+            num_valid_tokens = T // 2
+            valid_token_count = torch.tensor(
+                [num_valid_tokens], dtype=torch.int32, device="cuda"
+            )
+            partial_expert_indices = torch.where(
+                torch.arange(T).cuda() < num_valid_tokens, expert_indices, -1
+            )
+            partial_token_indices = torch.where(
+                torch.arange(T).cuda() < num_valid_tokens, token_indices, -1
+            )
+
         def torch_fn() -> Tuple[torch.Tensor, torch.Tensor]:
             shuffled_x = torch.index_select(x, dim=0, index=token_indices)
             shuffled_scores = torch.index_select(scores, dim=1, index=token_indices)
@@ -156,25 +197,37 @@ def triton_fn() -> Tuple[torch.Tensor, torch.Tensor]:
             if compiled:
                 op = torch.compile(op)
             test_output_q, test_output_scales = op(
-                x, token_indices, expert_indices, scores_, scale_ub
+                x,
+                partial_token_indices,
+                partial_expert_indices,
+                scores_,
+                scale_ub,
+                valid_token_count,
             )
             return test_output_q, test_output_scales
 
         test_output_q, test_output_scales = triton_fn()
         test_output = test_output_q.to(torch.float32) * test_output_scales.view(-1, 1)
 
-        torch.testing.assert_close(torch_output, test_output, atol=1e-3, rtol=1.6e-2)
+        torch.testing.assert_close(
+            torch_output[:num_valid_tokens],
+            test_output[:num_valid_tokens],
+            atol=1e-3,
+            rtol=1.6e-2,
+        )
 
     @given(
         num_tokens=st.sampled_from([1, 128, 2048, 4096, 16384]),
         dim=st.sampled_from([5120]),
+        partial=st.sampled_from([True, False]),
         compiled=st.sampled_from([True, False]),
     )
     @settings(verbosity=Verbosity.verbose, max_examples=_MAX_SAMPLES, deadline=None)
     def test_scatter_add_dense_tokens(
         self,
         num_tokens: int,
         dim: int,
+        partial: bool,
         compiled: bool,
     ) -> None:
         torch.manual_seed(0)
@@ -190,6 +243,18 @@ def test_scatter_add_dense_tokens(
             torch.int32
         )
 
+        num_valid_tokens: int = num_tokens
+        valid_token_count: Optional[torch.Tensor] = None
+        partial_token_indices: torch.Tensor = token_indices
+        if partial:
+            num_valid_tokens = num_tokens // 2
+            valid_token_count = torch.tensor(
+                [num_valid_tokens], dtype=torch.int32, device="cuda"
+            )
+            partial_token_indices = torch.where(
+                torch.arange(num_tokens).cuda() < num_valid_tokens, token_indices, -1
+            )
+
         test_out_tokens: torch.Tensor = out_tokens.clone()
         ref_out_tokens: torch.Tensor = out_tokens.clone()
 
@@ -201,11 +266,12 @@ def fn() -> None:
                 test_out_tokens,
                 in_tokens,
                 token_indices,
+                valid_token_count,
             )
 
         fn()
 
-        token_indices: torch.Tensor = token_indices.to(torch.int64)
+        token_indices: torch.Tensor = token_indices[:num_valid_tokens].to(torch.int64)
 
         def ref_fn() -> None:
             ref_out_tokens.scatter_add_(
@@ -217,7 +283,10 @@ def ref_fn() -> None:
         ref_fn()
 
         torch.testing.assert_close(
-            test_out_tokens, ref_out_tokens, atol=1e-3, rtol=1.6e-2
+            test_out_tokens[:num_valid_tokens],
+            ref_out_tokens[:num_valid_tokens],
+            atol=1e-3,
+            rtol=1.6e-2,
         )
 
     @given(