
Commit 6ce61da

Add deepgemm warm up and handle import by ENABLE_JIT_DEEPGEMM

1 parent a64ff5a · commit 6ce61da

File tree

3 files changed: +31 −6 lines

python/sglang/srt/layers/attention/nsa/nsa_indexer.py

Lines changed: 2 additions & 6 deletions
@@ -230,18 +230,14 @@ def _with_real_sm_count(self):
         yield
 
     def _weights_proj_bf16_in_fp32_out(self, x: torch.Tensor) -> torch.Tensor:
-        try:
-            from deep_gemm import bf16_gemm_nt
-        except ImportError:
-            bf16_gemm_nt = None
-        if bf16_gemm_nt is not None:
+        if deep_gemm_wrapper.ENABLE_JIT_DEEPGEMM:
             weight = self.weights_proj.weight
             out = torch.empty(
                 (x.shape[0], weight.shape[0]),
                 dtype=torch.float32,
                 device=x.device,
             )
-            bf16_gemm_nt(x, weight, out)
+            deep_gemm_wrapper.gemm_nt_bf16bf16f32(x, weight, out)
             return out
 
     if _is_hip:
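For context, this hunk replaces a per-call-site try/except import with a single flag check. A minimal sketch of the wrapper-flag pattern the new code relies on (illustrative only; the real ENABLE_JIT_DEEPGEMM in deep_gemm_wrapper is derived from more than a bare import check):

# Illustrative sketch, not the actual sglang code: resolve the optional
# deep_gemm dependency once, in one module, and expose a boolean flag.
try:
    import deep_gemm  # DeepGEMM JIT kernels, if installed
    ENABLE_JIT_DEEPGEMM = True
except ImportError:
    deep_gemm = None
    ENABLE_JIT_DEEPGEMM = False  # callers fall back to a non-DeepGEMM path

Centralizing the import means call sites such as _weights_proj_bf16_in_fp32_out only branch on the flag, and every DeepGEMM call goes through the wrapper where warmup and compile bookkeeping can see it.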

python/sglang/srt/layers/deep_gemm_wrapper/compile_utils.py

Lines changed: 15 additions & 0 deletions
@@ -96,6 +96,7 @@ class DeepGemmKernelType(IntEnum):
     GROUPED_GEMM_NT_F8F8BF16_MASKED = auto()
     GROUPED_GEMM_NT_F8F8BF16_CONTIG = auto()
     GEMM_NT_F8F8BF16 = auto()
+    GEMM_NT_BF16BF16F32 = auto()
 
 
 _INITIALIZATION_DICT: Dict[Tuple[DeepGemmKernelType, int, int, int], bool] = dict()
@@ -216,6 +217,7 @@ def create(kernel_type: DeepGemmKernelType, **kwargs):
             DeepGemmKernelType.GEMM_NT_F8F8BF16: _NormalWarmupExecutor,
             DeepGemmKernelType.GROUPED_GEMM_NT_F8F8BF16_CONTIG: _GroupedContWarmupExecutor,
             DeepGemmKernelType.GROUPED_GEMM_NT_F8F8BF16_MASKED: _GroupedMaskedWarmupExecutor,
+            DeepGemmKernelType.GEMM_NT_BF16BF16F32: _BF16F32WarmupExecutor,
         }[kernel_type](**kwargs)
 
     @staticmethod
@@ -235,6 +237,9 @@ def get_memory_requirement(
             + num_groups * 4
             + num_groups * max_m * n * 2
         ) / _GB
+        elif kernel_type == DeepGemmKernelType.GEMM_NT_BF16BF16F32:
+            # bf16 lhs + bf16 rhs + fp32 out
+            return (max_m * k * 2 + n * k * 2 + max_m * n * 4) / _GB
         else:
             raise ValueError(f"Invalid kernel type: {kernel_type}")
 
@@ -317,6 +322,16 @@ def execute(self, m):
         )
 
 
+class _BF16F32WarmupExecutor(_BaseWarmupExecutor):
+    def __init__(self, max_m: int, n: int, k: int, num_groups: int):
+        self.lhs = torch.empty((max_m, k), device="cuda", dtype=torch.bfloat16)
+        self.rhs = torch.empty((n, k), device="cuda", dtype=torch.bfloat16)
+        self.out = torch.empty((max_m, n), device="cuda", dtype=torch.float32)
+
+    def execute(self, m):
+        deep_gemm.bf16_gemm_nt(self.lhs[:m], self.rhs, self.out[:m])
+
+
 @contextmanager
 def deep_gemm_execution_hook(
     m: int, n: int, k: int, num_groups: int, kernel_type: DeepGemmKernelType
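To make the new memory estimate concrete: bf16 elements take 2 bytes and fp32 elements take 4, so the warmup buffers cost max_m*k*2 + n*k*2 + max_m*n*4 bytes. A small worked example (shapes are illustrative, not taken from the commit, and _GB is assumed to be 1024**3 as the name suggests):

# Worked example of the GEMM_NT_BF16BF16F32 memory estimate (illustrative shapes).
_GB = 1024 ** 3  # assumption: gibibytes, matching the formula's divisor

def bf16f32_warmup_gb(max_m: int, n: int, k: int) -> float:
    lhs = max_m * k * 2  # bf16 LHS buffer
    rhs = n * k * 2      # bf16 RHS buffer
    out = max_m * n * 4  # fp32 output buffer
    return (lhs + rhs + out) / _GB

# max_m=4096, n=128, k=7168:
# (58720256 + 1835008 + 2097152) / 2**30 ≈ 0.058 GB
print(bf16f32_warmup_gb(4096, 128, 7168))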

python/sglang/srt/layers/deep_gemm_wrapper/entrypoint.py

Lines changed: 14 additions & 0 deletions
@@ -102,6 +102,20 @@ def gemm_nt_f8f8bf16(
     )
 
 
+def gemm_nt_bf16bf16f32(
+    lhs: torch.Tensor,
+    rhs: torch.Tensor,
+    out: torch.Tensor,
+):
+    m, k = lhs.shape
+    n, _ = rhs.shape
+    num_groups = 1
+    kernel_type = compile_utils.DeepGemmKernelType.GEMM_NT_BF16BF16F32
+
+    with compile_utils.deep_gemm_execution_hook(m, n, k, num_groups, kernel_type):
+        deep_gemm.bf16_gemm_nt(lhs, rhs, out)
+
+
 def update_deep_gemm_config(gpu_id: int, server_args: ServerArgs):
     compile_utils.update_deep_gemm_config(gpu_id, server_args)
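A minimal usage sketch of the new entrypoint (the shapes and the import path are assumptions for illustration; it requires a CUDA device and a DeepGEMM build that exposes bf16_gemm_nt):

import torch

from sglang.srt.layers import deep_gemm_wrapper  # assumed import path

m, n, k = 256, 128, 7168  # illustrative shapes
x = torch.randn(m, k, device="cuda", dtype=torch.bfloat16)   # bf16 activations
w = torch.randn(n, k, device="cuda", dtype=torch.bfloat16)   # bf16 weight, (n, k) row-major
out = torch.empty(m, n, device="cuda", dtype=torch.float32)  # fp32 result buffer

if deep_gemm_wrapper.ENABLE_JIT_DEEPGEMM:
    # NT layout: out = x @ w.T, with the result stored in fp32
    deep_gemm_wrapper.gemm_nt_bf16bf16f32(x, w, out)

Writing into a caller-provided fp32 out tensor matches how _weights_proj_bf16_in_fp32_out uses the entrypoint in the first file above.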
