Skip to content

Commit c20e99e

Browse files
committed
Selective build of SYCL TLA kernels
Signed-off-by: Xinyu Chen <xinyu1.chen@intel.com>
1 parent adfc54b commit c20e99e

File tree

5 files changed

+59
-18
lines changed

5 files changed

+59
-18
lines changed

CMakeLists.txt

Lines changed: 37 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,9 @@ option(VLLM_XPU_ENABLE_XE_DEFAULT "Enable XE Default architecture kernels" ON)
5959
option(BASIC_KERNELS_ENABLED "Build basic kernels (_C extension)" ON)
6060
option(FA2_KERNELS_ENABLED
6161
"Build Flash Attention 2 kernels (_vllm_fa2_C extension)" ON)
62-
option(MOE_KERNELS_ENABLED "Build MoE kernels (_moe_C extension)" ON)
62+
option(MOE_KERNELS_ENABLED
63+
"Build MoE kernels (_moe_C extension + grouped_gemm TLA)" ON)
64+
option(GDN_KERNELS_ENABLED "Build GDN attention kernels (gdn_attn TLA)" ON)
6365
option(XPU_SPECIFIC_KERNELS_ENABLED
6466
"Build XPU-specific kernels (_xpu_C extension)" ON)
6567
option(XPUMEM_ALLOCATOR_ENABLED "Build xpumem_allocator extension" ON)
@@ -72,6 +74,7 @@ message(STATUS " VLLM_XPU_ENABLE_XE_DEFAULT = ${VLLM_XPU_ENABLE_XE_DEFAULT}")
7274
message(STATUS " BASIC_KERNELS_ENABLED = ${BASIC_KERNELS_ENABLED}")
7375
message(STATUS " FA2_KERNELS_ENABLED = ${FA2_KERNELS_ENABLED}")
7476
message(STATUS " MOE_KERNELS_ENABLED = ${MOE_KERNELS_ENABLED}")
77+
message(STATUS " GDN_KERNELS_ENABLED = ${GDN_KERNELS_ENABLED}")
7578
message(
7679
STATUS " XPU_SPECIFIC_KERNELS_ENABLED = ${XPU_SPECIFIC_KERNELS_ENABLED}")
7780
message(STATUS " XPUMEM_ALLOCATOR_ENABLED = ${XPUMEM_ALLOCATOR_ENABLED}")
@@ -331,23 +334,40 @@ if(BUILD_SYCL_TLA_KERNELS)
331334
# extensions shared library
332335
set(SYCL_TLA_COMPILE_OPTIONS "")
333336
if(VLLM_XPU_ENABLE_XE_DEFAULT)
334-
add_subdirectory(csrc/xpu/grouped_gemm/xe_default)
335-
list(APPEND GROUPED_GEMM_LIB_NAME "grouped_gemm_xe_default")
337+
if(MOE_KERNELS_ENABLED)
338+
add_subdirectory(csrc/xpu/grouped_gemm/xe_default)
339+
list(APPEND GROUPED_GEMM_LIB_NAME "grouped_gemm_xe_default")
340+
endif()
336341
list(APPEND SYCL_TLA_COMPILE_OPTIONS -DVLLM_XPU_ENABLE_XE_DEFAULT)
337342
endif()
338343
if(VLLM_XPU_ENABLE_XE2)
339-
add_subdirectory(csrc/xpu/grouped_gemm/xe_2)
340-
add_subdirectory(csrc/xpu/attn/xe_2)
341-
add_subdirectory(csrc/xpu/gdn_attn/xe_2)
342-
list(APPEND GROUPED_GEMM_LIB_NAME "grouped_gemm_xe_2")
343-
list(APPEND ATTN_KERNEL_LIB_NAME "attn_kernels_xe_2")
344-
list(APPEND GDN_ATTN_LIB_NAME "gdn_attn_kernels_xe_2")
344+
if(MOE_KERNELS_ENABLED)
345+
add_subdirectory(csrc/xpu/grouped_gemm/xe_2)
346+
list(APPEND GROUPED_GEMM_LIB_NAME "grouped_gemm_xe_2")
347+
endif()
348+
if(FA2_KERNELS_ENABLED)
349+
add_subdirectory(csrc/xpu/attn/xe_2)
350+
list(APPEND ATTN_KERNEL_LIB_NAME "attn_kernels_xe_2")
351+
endif()
352+
if(GDN_KERNELS_ENABLED)
353+
add_subdirectory(csrc/xpu/gdn_attn/xe_2)
354+
list(APPEND GDN_ATTN_LIB_NAME "gdn_attn_kernels_xe_2")
355+
endif()
345356
list(APPEND SYCL_TLA_COMPILE_OPTIONS -DVLLM_XPU_ENABLE_XE2)
346357
endif()
347358
list(APPEND VLLM_GPU_COMPILE_FLAGS ${SYCL_TLA_COMPILE_OPTIONS})
348359

349360
endif()
350361

362+
# Feature compile defines — these guard op registrations and interface code so
363+
# that disabled features don't pull in unbuilt TLA library symbols.
364+
if(MOE_KERNELS_ENABLED)
365+
list(APPEND VLLM_GPU_COMPILE_FLAGS -DVLLM_MOE_ENABLED)
366+
endif()
367+
if(GDN_KERNELS_ENABLED)
368+
list(APPEND VLLM_GPU_COMPILE_FLAGS -DVLLM_GDN_ENABLED)
369+
endif()
370+
351371
# define vLLM XPU cmake variables
352372

353373
set(VLLM_XPU_INCLUDE_DIR "")
@@ -505,9 +525,14 @@ if(XPU_SPECIFIC_KERNELS_ENABLED)
505525
"csrc/xpu/sampler/topk_topp_sampler.cpp"
506526
"csrc/xpu/sycl/deepseek_scaling_rope.cpp"
507527
"csrc/xpu/rand/exponential.cpp"
508-
"csrc/xpu/grouped_gemm/grouped_gemm_interface.cpp"
509-
"csrc/xpu/utils.cpp"
510-
"csrc/xpu/gdn_attn/gdn_attn_interface.cpp")
528+
"csrc/xpu/utils.cpp")
529+
if(MOE_KERNELS_ENABLED)
530+
list(APPEND VLLM_EXT_XPU_SRC
531+
"csrc/xpu/grouped_gemm/grouped_gemm_interface.cpp")
532+
endif()
533+
if(GDN_KERNELS_ENABLED)
534+
list(APPEND VLLM_EXT_XPU_SRC "csrc/xpu/gdn_attn/gdn_attn_interface.cpp")
535+
endif()
511536
include_directories("/usr/include")
512537
# TODO: check if we need this flags list(APPEND VLLM_GPU_FLAGS
513538
# "-gline-tables-only")

csrc/xpu/ops.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ torch::Tensor int4_gemm_w4a8(
5252
const std::optional<torch::Tensor>& g_idx,
5353
const std::optional<torch::Tensor>& bias);
5454

55+
#ifdef VLLM_MOE_ENABLED
5556
torch::Tensor cutlass_grouped_gemm_interface(
5657
torch::Tensor ptr_A,
5758
torch::Tensor ptr_B,
@@ -64,6 +65,7 @@ torch::Tensor cutlass_grouped_gemm_interface(
6465
int64_t num_experts,
6566
bool is_B_int4,
6667
bool is_B_mxfp4);
68+
#endif
6769

6870
std::tuple<at::Tensor, at::Tensor> deepseek_scaling_rope(
6971
const at::Tensor& positions,
@@ -74,6 +76,7 @@ std::tuple<at::Tensor, at::Tensor> deepseek_scaling_rope(
7476
int64_t rotary_dim,
7577
bool is_neox);
7678

79+
#ifdef VLLM_GDN_ENABLED
7780
void gdn_attention(
7881
torch::Tensor& core_attn_out,
7982
torch::Tensor& z,
@@ -98,6 +101,7 @@ void gdn_attention(
98101
const int64_t num_actual_tokens,
99102
const int64_t tp_size,
100103
const bool reorder_input);
104+
#endif
101105

102106
bool is_bmg(int64_t device_index);
103107

csrc/xpu/torch_bindings.cpp

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
#include "core/registration.h"
22
#include "xpu/ops.h"
3-
#include "xpu/grouped_gemm/grouped_gemm_interface.h"
3+
#ifdef VLLM_MOE_ENABLED
4+
#include "xpu/grouped_gemm/grouped_gemm_interface.h"
5+
#endif
46
#include "xpu/lora/lora_ops.h"
57

68
#include <torch/library.h>
@@ -35,6 +37,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, xpu_ops) {
3537
"bias) -> Tensor");
3638
xpu_ops.impl("int4_gemm_w4a8", torch::kXPU, &int4_gemm_w4a8);
3739

40+
#ifdef VLLM_MOE_ENABLED
3841
xpu_ops.def(
3942
"cutlass_grouped_gemm_interface(Tensor ptr_A, Tensor ptr_B, Tensor? "
4043
"ptr_scales, "
@@ -48,6 +51,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, xpu_ops) {
4851
"cutlass_grouped_gemm_interface",
4952
torch::kXPU,
5053
&cutlass_grouped_gemm_interface);
54+
#endif
5155

5256
xpu_ops.def(
5357
"deepseek_scaling_rope(Tensor! positions, Tensor! query, Tensor! key, "
@@ -72,6 +76,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, xpu_ops) {
7276
"-> ()");
7377
xpu_ops.impl("bgmv_expand_slice", torch::kXPU, &bgmv_expand_slice);
7478

79+
#ifdef VLLM_GDN_ENABLED
7580
xpu_ops.def(
7681
"gdn_attention(Tensor! core_attn_out, Tensor! z, Tensor "
7782
"projected_states_qkvz, Tensor projected_states_ba,"
@@ -83,6 +88,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, xpu_ops) {
8388
"Tensor non_spec_state_indices_tensor, int num_actual_tokens, int "
8489
"tp_size, bool reorder_input) -> ()");
8590
xpu_ops.impl("gdn_attention", torch::kXPU, &gdn_attention);
91+
#endif
8692

8793
// for empty tensor functions, we don't need dispatch key like torch::kXPU
8894
xpu_ops.def("is_bmg(int device_index) -> bool");

setup.py

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -191,6 +191,7 @@ def configure(self, ext: CMakeExtension) -> None:
191191
"BASIC_KERNELS_ENABLED",
192192
"FA2_KERNELS_ENABLED",
193193
"MOE_KERNELS_ENABLED",
194+
"GDN_KERNELS_ENABLED",
194195
"XPU_SPECIFIC_KERNELS_ENABLED",
195196
"XPUMEM_ALLOCATOR_ENABLED",
196197
]
@@ -528,11 +529,14 @@ def build_extensions(self) -> None:
528529
if _is_enabled("VLLM_XPU_ENABLE_XE2"):
529530
if _is_enabled("FA2_KERNELS_ENABLED"):
530531
additional_libraries["attn_kernels_xe_2"] = "/csrc/xpu/attn/xe_2"
531-
additional_libraries["gdn_attn_kernels_xe_2"] = (
532-
"/csrc/xpu/gdn_attn/xe_2")
533-
additional_libraries["grouped_gemm_xe_2"] = (
534-
"/csrc/xpu/grouped_gemm/xe_2")
535-
if _is_enabled("VLLM_XPU_ENABLE_XE_DEFAULT"):
532+
if _is_enabled("GDN_KERNELS_ENABLED"):
533+
additional_libraries["gdn_attn_kernels_xe_2"] = (
534+
"/csrc/xpu/gdn_attn/xe_2")
535+
if _is_enabled("MOE_KERNELS_ENABLED"):
536+
additional_libraries["grouped_gemm_xe_2"] = (
537+
"/csrc/xpu/grouped_gemm/xe_2")
538+
if _is_enabled("VLLM_XPU_ENABLE_XE_DEFAULT") and _is_enabled(
539+
"MOE_KERNELS_ENABLED"):
536540
additional_libraries["grouped_gemm_xe_default"] = (
537541
"/csrc/xpu/grouped_gemm/xe_default")
538542

tools/envs.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,8 @@ def get_vllm_port() -> Optional[int]:
121121
lambda: os.getenv("FA2_KERNELS_ENABLED", "ON"),
122122
"MOE_KERNELS_ENABLED":
123123
lambda: os.getenv("MOE_KERNELS_ENABLED", "ON"),
124+
"GDN_KERNELS_ENABLED":
125+
lambda: os.getenv("GDN_KERNELS_ENABLED", "ON"),
124126
"XPU_SPECIFIC_KERNELS_ENABLED":
125127
lambda: os.getenv("XPU_SPECIFIC_KERNELS_ENABLED", "ON"),
126128
"XPUMEM_ALLOCATOR_ENABLED":

0 commit comments

Comments (0)