forked from vllm-project/vllm-xpu-kernels
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathCMakeLists.txt
More file actions
602 lines (531 loc) · 19.4 KB
/
CMakeLists.txt
File metadata and controls
602 lines (531 loc) · 19.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
cmake_minimum_required(VERSION 3.26)

# When building directly with CMake, make sure you run the install step: it is
# what places the built .so files in the correct location.
#
# Example:
#   mkdir build && cd build
#   cmake -G Ninja -DVLLM_PYTHON_EXECUTABLE=`which python3` \
#         -DCMAKE_INSTALL_PREFIX=.. ..
#   cmake --build . --target install
#
# If you only want to build one target, install its component manually:
#   cmake --build . --target _C
#   cmake --install . --component _C
project(vllm_extensions LANGUAGES CXX)

# Target device backend. Defaults to "xpu"; setup.py relies on this default.
set(VLLM_TARGET_DEVICE "xpu" CACHE STRING "Target device backend for vLLM")

message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
message(STATUS "Target device: ${VLLM_TARGET_DEVICE}")

# Pull in the project's CMake helpers and make its find-modules discoverable.
include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake)
list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/Modules)

# Read VLLM_PYTHON_PATH once so that CMake does not warn about an unused
# manually-specified variable.
set(ignoreMe "${VLLM_PYTHON_PATH}")

# Prevent installation of dependencies (cutlass) by default.
install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
#
# Supported python versions. They are searched in the order listed and the
# first match is selected. Keep this list in sync with setup.py.
#
set(PYTHON_SUPPORTED_VERSIONS "3.9;3.10;3.11;3.12")

# Supported Intel GPU architectures.
set(SYCL_SUPPORTED_ARCHS intel_gpu_pvc intel_gpu_bmg_g21 intel_gpu_bmg_g31)

#
# Supported/expected torch version for XPU.
#
# A mismatching pytorch version is meant to produce a warning rather than an
# error.
#
# TODO: we need to align the torch version with the one used in vLLM.
#
set(TORCH_SUPPORTED_VERSION_XPU "2.9.0")

# Master switch for the SYCL-TLA based kernel libraries.
set(BUILD_SYCL_TLA_KERNELS ON CACHE BOOL "Build SYCL-TLA based kernels for XPU")
# ARCHITECTURE OPTIONS
option(VLLM_XPU_ENABLE_XE2 "Enable XE2 architecture kernels" ON)
option(VLLM_XPU_ENABLE_XE_DEFAULT "Enable XE Default architecture kernels" ON)

# KERNEL OPTIONS — each one controls whether the corresponding Python extension
# is built. Override from the command line with -D<NAME>=OFF or via the
# identically-named environment variable (read by setup.py).
option(BASIC_KERNELS_ENABLED "Build basic kernels (_C extension)" ON)
option(FA2_KERNELS_ENABLED
       "Build Flash Attention 2 kernels (_vllm_fa2_C extension)" ON)
option(MOE_KERNELS_ENABLED
       "Build MoE kernels (_moe_C extension + grouped_gemm TLA)" ON)
option(GDN_KERNELS_ENABLED "Build GDN attention kernels (gdn_attn TLA)" ON)
option(XPU_SPECIFIC_KERNELS_ENABLED
       "Build XPU-specific kernels (_xpu_C extension)" ON)
option(XPUMEM_ALLOCATOR_ENABLED "Build xpumem_allocator extension" ON)

# Print the effective value of every switch above, one per line, framed by
# empty status lines.
message(STATUS "")
message(STATUS "Kernel build configuration:")
foreach(_vllm_kernel_opt IN ITEMS
        BUILD_SYCL_TLA_KERNELS
        VLLM_XPU_ENABLE_XE2
        VLLM_XPU_ENABLE_XE_DEFAULT
        BASIC_KERNELS_ENABLED
        FA2_KERNELS_ENABLED
        MOE_KERNELS_ENABLED
        GDN_KERNELS_ENABLED
        XPU_SPECIFIC_KERNELS_ENABLED
        XPUMEM_ALLOCATOR_ENABLED)
  message(STATUS " ${_vllm_kernel_opt} = ${${_vllm_kernel_opt}}")
endforeach()
message(STATUS "")
#
# Try to find a python package whose executable exactly matches
# `VLLM_PYTHON_EXECUTABLE` and whose version is one of the supported ones.
# The variable is mandatory: configuration aborts if it is not set.
#
if(NOT VLLM_PYTHON_EXECUTABLE)
  message(
    FATAL_ERROR
      "Please set VLLM_PYTHON_EXECUTABLE to the path of the desired python version"
      " before running cmake configure.")
endif()
find_python_from_executable(${VLLM_PYTHON_EXECUTABLE}
                            "${PYTHON_SUPPORTED_VERSIONS}")

#
# Update cmake's `CMAKE_PREFIX_PATH` with torch location.
#
append_cmake_prefix_path("torch" "torch.utils.cmake_prefix_path")

#
# Import the torch and oneDNN cmake configuration; both are hard requirements.
#
find_package(Torch REQUIRED)
find_package(oneDNN REQUIRED)
#
# Only the XPU backend can be built by this file. For any other target device
# stop processing here; this is reported as a status message, not an error.
#
if(NOT VLLM_TARGET_DEVICE STREQUAL "xpu")
  message(STATUS "Not support building non-XPU device extensions.")
  return()
endif()

#
# Past this point VLLM_TARGET_DEVICE is known to be "xpu", so no further device
# checks are needed. Set up the GPU language.
#
message(STATUS "Building XPU")
set(VLLM_GPU_LANG "SYCL")

#
# Check the torch version and warn (rather than fail) if it isn't the expected
# one. The check is skipped when find_package(Torch) did not report a version.
#
if(DEFINED Torch_VERSION
   AND NOT Torch_VERSION VERSION_EQUAL TORCH_SUPPORTED_VERSION_XPU)
  message(
    WARNING
      "Pytorch version ${TORCH_SUPPORTED_VERSION_XPU} "
      "expected for XPU build, saw ${Torch_VERSION} instead.")
endif()

#
# Override the GPU architectures detected by cmake/torch and filter them by the
# supported versions for the current language. The final set of arches is
# stored in `VLLM_GPU_ARCHES`.
#
# TODO: add sycl architectures
override_gpu_arches(VLLM_GPU_ARCHES ${VLLM_GPU_LANG}
                    "${${VLLM_GPU_LANG}_SUPPORTED_ARCHS}")
#
# Query torch for additional GPU compilation flags for the given
# `VLLM_GPU_LANG`. The final set of flags is stored in `VLLM_GPU_FLAGS`.
#
message(STATUS "Querying torch for GPU compiler flags for ${VLLM_GPU_LANG}...")
get_torch_gpu_compiler_flags(VLLM_GPU_FLAGS ${VLLM_GPU_LANG})
message(STATUS "Torch GPU compiler flags: ${VLLM_GPU_FLAGS}")

#
# Use FetchContent for C++ dependencies that are compiled as part of vLLM's
# build process. setup.py will override FETCHCONTENT_BASE_DIR to play nicely
# with sccache. Each dependency that produces build artifacts should override
# its BINARY_DIR to avoid conflicts between build types. It should instead be
# set to ${CMAKE_BINARY_DIR}/<dependency>.
#
include(FetchContent)
file(MAKE_DIRECTORY ${FETCHCONTENT_BASE_DIR}) # Ensure the directory exists
message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}")

# Set RPATH to $ORIGIN so that the extensions can find their shared libraries
# when loaded by Python.
set(CMAKE_INSTALL_RPATH "$ORIGIN")

# Warnings are treated as errors by default. Configure with
# -DVLLM_XPU_ENABLE_WERROR=OFF to opt out, e.g. when a newer compiler emits
# warnings the code base has not been cleaned up for yet.
option(VLLM_XPU_ENABLE_WERROR
       "Treat C++ compiler warnings as errors (-Werror)" ON)
if(VLLM_XPU_ENABLE_WERROR)
  string(APPEND CMAKE_CXX_FLAGS " -Werror")
endif()
# Assemble the SYCL compile and link flag lists. The results that the extension
# targets below consume are VLLM_GPU_COMPILE_FLAGS and VLLM_GPU_LINK_FLAGS.
if(VLLM_GPU_LANG STREQUAL "SYCL")
  #
  # For SYCL we want to use the same flags as CUDA, so we set them here. Note
  # that SYCL does not support all CUDA flags, so some of them will be ignored.
  #
  # TODO: check SYCL flags
  #
  # The self-assignment below keeps whatever value the variable already has.
  set(CMAKE_${VLLM_GPU_LANG}_FLAGS "${CMAKE_${VLLM_GPU_LANG}_FLAGS}")

  # Pass csrc/sycl_first.h to the C++ compiler through `-include`.
  set(SYCL_FIRST_HEADER "${CMAKE_CURRENT_SOURCE_DIR}/csrc/sycl_first.h")
  string(APPEND CMAKE_CXX_FLAGS " -include ${SYCL_FIRST_HEADER}")

  # AOT device lists. Both can be overridden through the environment:
  #   export VLLM_XPU_AOT_DEVICES="pvc,bmg-g21-a0"
  #   export VLLM_XPU_XE2_AOT_DEVICES="pvc,bmg-g31-a0"
  # An environment variable that is set but empty explicitly disables AOT
  # (e.g. export VLLM_XPU_AOT_DEVICES=""), because the resulting variable then
  # evaluates to false in the `if(AOT_DEVICES)` checks below.
  if(DEFINED ENV{VLLM_XPU_AOT_DEVICES})
    set(AOT_DEVICES $ENV{VLLM_XPU_AOT_DEVICES})
  else()
    set(AOT_DEVICES "pvc,bmg,bmg-g21-a0,bmg-g31-a0")
  endif()
  if(DEFINED ENV{VLLM_XPU_XE2_AOT_DEVICES})
    set(XE2_AOT_DEVICES $ENV{VLLM_XPU_XE2_AOT_DEVICES})
  else()
    set(XE2_AOT_DEVICES "pvc,bmg,bmg-g21-a0,bmg-g31-a0")
  endif()

  # ============= COMPILE OPTIONS ==================
  unset(SYCL_KERNEL_OPTIONS)
  set(SYCL_FLAGS "-fsycl")
  set(SYCL_COMPILE_FLAGS ${SYCL_FLAGS} "-O3" "-DNDEBUG")
  if(AOT_DEVICES)
    list(APPEND SYCL_COMPILE_FLAGS -fsycl-targets=spir64_gen)
  endif()
  # Final build option be like: icpx -fsycl -fsycl-target=spir64_gen
  # ${SYCL_KERNEL_OPTIONS} -fsycl-host-compiler=gcc
  # -fsycl-host-compiler-options='${CMAKE_HOST_FLAGS}' kernel.cpp -o kernel.o
  set(VLLM_GPU_COMPILE_FLAGS ${SYCL_COMPILE_FLAGS})

  # ============= LINK OPTIONS ==================
  set(SYCL_LINK_FLAGS "-fsycl")
  set(SYCL_DEVICE_LINK_FLAGS
      ${SYCL_LINK_FLAGS}
      -fsycl-max-parallel-link-jobs=16
      -flink-huge-device-code
      "-Xspirv-translator;-spirv-ext=+SPV_INTEL_split_barrier,+SPV_INTEL_2d_block_io,+SPV_INTEL_subgroup_matrix_multiply_accumulate"
  )
  if(AOT_DEVICES)
    list(APPEND SYCL_DEVICE_LINK_FLAGS -fsycl-targets=spir64_gen
         -Xsycl-target-backend=spir64_gen)
  endif()
  # Final link option be like: icpx -fsycl -fsycl-target=spir64_gen
  # ${SYCL_DEVICE_LINK_FLAGS} -Xsycl-target-backend=spir64_gen
  # '${SYCL_OFFLINE_COMPILER_FLAGS}' kernel.o -o device-code.o
  set(FINAL_LINK_OPTIONS ${SYCL_DEVICE_LINK_FLAGS})
  if(AOT_DEVICES)
    # Deliberately ONE list element containing a space: it directly follows
    # -Xsycl-target-backend=spir64_gen in the list.
    list(APPEND FINAL_LINK_OPTIONS "-device ${AOT_DEVICES}")
  endif()
  set(VLLM_GPU_LINK_FLAGS ${FINAL_LINK_OPTIONS})

  message(STATUS "Final SYCL compile options: ${VLLM_GPU_COMPILE_FLAGS}")
  message(STATUS "Final SYCL link options: ${VLLM_GPU_LINK_FLAGS}")
endif()
# sycl-tla setup. Fetch the SYCL-TLA (CUTLASS for SYCL) code base and set up the
# related include-directory variables and compile flags.
if(VLLM_GPU_LANG STREQUAL "SYCL")
  message(STATUS "Setting up SYCL-TLA dependency...")

  # add cutlass dependency
  set(CUTLASS_ENABLE_HEADERS_ONLY
      "ON"
      CACHE BOOL "Enable only the header library")

  # Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages
  # when building.
  set(CUTLASS_REVISION
      "cd763790ad2f74d7294435ecf77682bac0062c3a"
      CACHE STRING "CUTLASS revision to use")

  # A source checkout can be supplied through the VLLM_CUTLASS_SRC_DIR
  # environment variable (or the CMake variable of the same name).
  if(DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
    set(VLLM_CUTLASS_SRC_DIR $ENV{VLLM_CUTLASS_SRC_DIR})
  endif()

  if(VLLM_CUTLASS_SRC_DIR)
    # Use the specified CUTLASS source directory for compilation.
    # IS_ABSOLUTE expects a path, not a variable name, so the variable has to
    # be expanded explicitly here.
    if(NOT IS_ABSOLUTE "${VLLM_CUTLASS_SRC_DIR}")
      get_filename_component(VLLM_CUTLASS_SRC_DIR "${VLLM_CUTLASS_SRC_DIR}"
                             ABSOLUTE)
    endif()
    message(
      STATUS
        "The VLLM_CUTLASS_SRC_DIR is set, using ${VLLM_CUTLASS_SRC_DIR} for compilation"
    )
    FetchContent_Declare(cutlass-sycl SOURCE_DIR ${VLLM_CUTLASS_SRC_DIR})
  else()
    # No local checkout was given: download the pinned revision.
    FetchContent_Declare(
      cutlass-sycl
      GIT_REPOSITORY https://github.com/intel/sycl-tla.git
      # Please keep this in sync with CUTLASS_REVISION line above.
      GIT_TAG ${CUTLASS_REVISION}
      GIT_PROGRESS TRUE
      # GIT_SHALLOW would speed up the download by skipping the history, but
      # with it GIT_TAG works only with branch names and tags. Because the
      # revision above is a commit hash, GIT_SHALLOW must stay FALSE.
      GIT_SHALLOW FALSE)
  endif()

  # cutlass compilation flags
  set(CUTLASS_ENABLE_SYCL "ON")
  # TODO: make this a list
  set(DPCPP_SYCL_TARGET
      "intel_gpu_bmg_g21"
      CACHE STRING "DPC++ SYCL target architectures")
  set(CMAKE_EXPORT_COMPILE_COMMANDS "ON")
  set(CUTLASS_ENABLE_BENCHMARKS "OFF")
  # disable cuda
  set(CUTLASS_ENABLE_GDC_FOR_SM100_DEFAULT
      OFF
      CACHE BOOL "DISABLE CUDA")

  FetchContent_MakeAvailable(cutlass-sycl)

  # Header locations inside the fetched source tree.
  set(CUTLASS_INCLUDE_DIR
      ${cutlass-sycl_SOURCE_DIR}/include
      CACHE PATH "CUTLASS Header Library")
  set(CUTLASS_TOOLS_UTIL_INCLUDE_DIR
      ${cutlass-sycl_SOURCE_DIR}/tools/util/include
      CACHE INTERNAL "")
  set(CUTLASS_APP_INCLUDE_DIR
      ${cutlass-sycl_SOURCE_DIR}/applications
      CACHE INTERNAL "")
  message(
    STATUS
      "cutlass dir: ${CUTLASS_INCLUDE_DIR} and ${CUTLASS_TOOLS_UTIL_INCLUDE_DIR} and ${CUTLASS_APP_INCLUDE_DIR}"
  )

  # Compile flags for code that includes the (header only) library.
  list(
    APPEND
    VLLM_CUTLASS_FLAGS
    "-DCUTLASS_ENABLE_HEADERS_ONLY"
    "-DCUTLASS_ENABLE_SYCL"
    "-DSYCL_INTEL_TARGET"
    "-DCUTLASS_VERSIONS_GENERATED"
    "-ftemplate-backtrace-limit=0"
    "-fdiagnostics-color=always")
endif()
# Names of the SYCL-TLA kernel libraries that get built. Each list stays empty
# unless the matching subdirectory is added below.
set(ATTN_KERNEL_LIB_NAME "")
set(GROUPED_GEMM_LIB_NAME "")
set(GDN_ATTN_LIB_NAME "")

if(BUILD_SYCL_TLA_KERNELS)
  set(SYCL_TLA_INCLUDE_DIRS
      ${CUTLASS_INCLUDE_DIR} ${CUTLASS_TOOLS_UTIL_INCLUDE_DIR}
      ${CUTLASS_APP_INCLUDE_DIR})
  set(SYCL_TLA_KERNELS_COMPILE_FLAGS
      ${VLLM_GPU_COMPILE_FLAGS} ${VLLM_CUTLASS_FLAGS}
      "-fno-sycl-instrument-device-code")

  # Compile static libraries for attn and grouped_gemm; they are linked into
  # the vLLM extension shared libraries.
  #
  # Ordering matters: every add_subdirectory() call runs BEFORE the -D define
  # of its own architecture is appended to SYCL_TLA_COMPILE_OPTIONS, so a
  # subdirectory only sees the defines that were appended earlier.
  set(SYCL_TLA_COMPILE_OPTIONS "")

  # --- XE default architecture ---
  if(VLLM_XPU_ENABLE_XE_DEFAULT)
    if(MOE_KERNELS_ENABLED)
      add_subdirectory(csrc/xpu/grouped_gemm/xe_default)
      list(APPEND GROUPED_GEMM_LIB_NAME "grouped_gemm_xe_default")
    endif()
    list(APPEND SYCL_TLA_COMPILE_OPTIONS -DVLLM_XPU_ENABLE_XE_DEFAULT)
  endif()

  # --- XE2 architecture ---
  if(VLLM_XPU_ENABLE_XE2)
    if(MOE_KERNELS_ENABLED)
      add_subdirectory(csrc/xpu/grouped_gemm/xe_2)
      list(APPEND GROUPED_GEMM_LIB_NAME "grouped_gemm_xe_2")
    endif()
    if(FA2_KERNELS_ENABLED)
      add_subdirectory(csrc/xpu/attn/xe_2)
      list(APPEND ATTN_KERNEL_LIB_NAME "attn_kernels_xe_2")
    endif()
    if(GDN_KERNELS_ENABLED)
      add_subdirectory(csrc/xpu/gdn_attn/xe_2)
      list(APPEND GDN_ATTN_LIB_NAME "gdn_attn_kernels_xe_2")
    endif()
    list(APPEND SYCL_TLA_COMPILE_OPTIONS -DVLLM_XPU_ENABLE_XE2)
  endif()

  # The architecture defines also apply to the extension targets defined later.
  list(APPEND VLLM_GPU_COMPILE_FLAGS ${SYCL_TLA_COMPILE_OPTIONS})
endif()
# Feature compile defines — these guard op registrations and interface code so
# that disabled features don't pull in unbuilt TLA library symbols.
#
# The reverse situation is possible too: a feature can be enabled while none of
# its TLA kernel libraries were built (BUILD_SYCL_TLA_KERNELS or the required
# architecture option is OFF). The _xpu_C extension then still compiles the
# feature's interface source but has no kernel library to link against, so warn
# about it in the same way as is done for FA2.
if(MOE_KERNELS_ENABLED)
  if(XPU_SPECIFIC_KERNELS_ENABLED AND NOT GROUPED_GEMM_LIB_NAME)
    message(
      WARNING
        "MOE_KERNELS_ENABLED is ON but no grouped_gemm kernel libraries are available. "
        "The _xpu_C extension will be built without them and may fail to link "
        "or fail at runtime. Enable BUILD_SYCL_TLA_KERNELS and at least one of "
        "VLLM_XPU_ENABLE_XE2 / VLLM_XPU_ENABLE_XE_DEFAULT for full MoE "
        "functionality.")
  endif()
  list(APPEND VLLM_GPU_COMPILE_FLAGS -DVLLM_MOE_ENABLED)
endif()
if(GDN_KERNELS_ENABLED)
  if(XPU_SPECIFIC_KERNELS_ENABLED AND NOT GDN_ATTN_LIB_NAME)
    message(
      WARNING
        "GDN_KERNELS_ENABLED is ON but no gdn_attn kernel libraries are available. "
        "The _xpu_C extension will be built without them and may fail to link "
        "or fail at runtime. Enable BUILD_SYCL_TLA_KERNELS and "
        "VLLM_XPU_ENABLE_XE2 for full GDN functionality.")
  endif()
  list(APPEND VLLM_GPU_COMPILE_FLAGS -DVLLM_GDN_ENABLED)
endif()
# define vLLM XPU cmake variables
#
# Include directories handed to every extension target below.
# NOTE(review): CMPLR_ROOT is not set anywhere in the visible part of this file;
# presumably it is provided by cmake/utils.cmake, a find-module, or the command
# line — confirm.
set(VLLM_XPU_INCLUDE_DIR
    ${CMPLR_ROOT}/include/
    ${CMPLR_ROOT}/include/sycl/
    ${CMPLR_ROOT}/include/syclcompat/)
message(STATUS "VLLM_XPU_INCLUDE_DIR: ${VLLM_XPU_INCLUDE_DIR}")

# Libraries for XPU targets to link against.
set(VLLM_XPU_LINK_LIBRARIES
    "sycl"
    "OpenCL"
    "pthread"
    "m"
    "dl"
    "torch")
#
# xpumem_allocator extension - XPU memory allocator with Python callbacks
#
if(XPUMEM_ALLOCATOR_ENABLED)
  message(STATUS "Enabling xpumem_allocator extension.")
  set(XPUMEM_ALLOCATOR_SRC "csrc/utils/mem_alloc.cpp")
  include_directories("/usr/include")

  # Dependencies of the Python module.
  find_package(Python REQUIRED COMPONENTS Development)
  find_package(Torch REQUIRED)
  list(APPEND PYTORCH_INCLUDES ${TORCH_INCLUDE_DIRS})

  # Create the Python module for xpumem_allocator.
  python_add_library(xpumem_allocator MODULE USE_SABI 3 WITH_SOABI
                     "${XPUMEM_ALLOCATOR_SRC}")
  set_target_properties(xpumem_allocator PROPERTIES CXX_STANDARD 17)

  # Drop the `Py_LIMITED_API=3` entry from the target's compile definitions, if
  # the target has any compile definitions at all.
  # NOTE(review): only this exact literal is removed — confirm it matches the
  # form of the definition that `USE_SABI 3` actually adds.
  get_target_property(EXISTING_DEFS xpumem_allocator COMPILE_DEFINITIONS)
  if(EXISTING_DEFS)
    list(REMOVE_ITEM EXISTING_DEFS "Py_LIMITED_API=3")
    set_property(TARGET xpumem_allocator PROPERTY COMPILE_DEFINITIONS
                                                  "${EXISTING_DEFS}")
  endif()

  target_include_directories(xpumem_allocator PRIVATE ${Python_INCLUDE_DIRS}
                                                      ${PYTORCH_INCLUDES} csrc)
  target_link_libraries(xpumem_allocator PRIVATE torch ${TORCH_LIBRARIES})

  # The SYCL compile and link flags only apply to SYCL builds.
  if(VLLM_GPU_LANG STREQUAL "SYCL")
    target_compile_options(xpumem_allocator PRIVATE ${VLLM_GPU_COMPILE_FLAGS})
    target_link_options(xpumem_allocator PRIVATE ${VLLM_GPU_LINK_FLAGS})
  endif()

  install(TARGETS xpumem_allocator LIBRARY DESTINATION vllm_xpu_kernels
          COMPONENT xpumem_allocator)
endif()
#
# Define other extension targets
#

#
# _C extension: the basic kernels, built only when BASIC_KERNELS_ENABLED is ON.
#
if(BASIC_KERNELS_ENABLED)
  message(STATUS "Enabling C extension.")

  # Sources compiled into the extension.
  set(VLLM_EXT_SRC
      "csrc/cache.cpp"
      "csrc/layernorm.cpp"
      "csrc/layernorm_quant.cpp"
      "csrc/activation.cpp"
      "csrc/pos_encoding_kernels.cpp"
      "csrc/fused_qknorm_rope.cpp"
      "csrc/torch_bindings.cpp"
      "csrc/quantization/fp8/fp8_quant.cpp"
      "csrc/quantization/fp4/mxfp4_quant.cpp"
      "csrc/xpu_view.cpp"
      "csrc/attention/merge_attn_states.cpp"
      "csrc/tensor_utils.cpp"
      "csrc/utils/mem_cpy.cpp"
      "csrc/topk_per_row.cpp")

  include_directories("/usr/include")

  define_gpu_extension_target(
    _C
    DESTINATION vllm_xpu_kernels
    LANGUAGE ${VLLM_GPU_LANG}
    SOURCES ${VLLM_EXT_SRC}
    COMPILE_FLAGS ${VLLM_GPU_COMPILE_FLAGS}
    LINK_FLAGS ${VLLM_GPU_LINK_FLAGS}
    ARCHITECTURES ${VLLM_GPU_ARCHES}
    INCLUDE_DIRECTORIES ${VLLM_XPU_INCLUDE_DIR}
    USE_SABI 3
    WITH_SOABI)
endif()
#
# flash attention _C extension (_vllm_fa2_C), built only when
# FA2_KERNELS_ENABLED is ON.
#
if(FA2_KERNELS_ENABLED)
  # The extension links the attention kernel libraries collected in
  # ATTN_KERNEL_LIB_NAME. Warn when that list is empty.
  if(NOT ATTN_KERNEL_LIB_NAME)
    message(
      WARNING
        "FA2_KERNELS_ENABLED is ON but no attention kernel libraries are available. "
        "The _vllm_fa2_C extension will be built without kernel libraries and may "
        "fail at runtime. Enable BUILD_SYCL_TLA_KERNELS and VLLM_XPU_ENABLE_XE2 "
        "for full FA2 functionality.")
  endif()

  message(STATUS "Enabling fa2 extension.")

  # CONFIGURE_DEPENDS makes the build re-run this glob, so that source files
  # added to or removed from csrc/flash_attn are picked up without a manual
  # reconfigure.
  file(
    GLOB FA2_GEN_SRCS CONFIGURE_DEPENDS
    "${CMAKE_CURRENT_SOURCE_DIR}/csrc/flash_attn/*.cpp"
    "${CMAKE_CURRENT_SOURCE_DIR}/csrc/xpu/attn/attn_interface.cpp")

  define_gpu_extension_target(
    _vllm_fa2_C
    DESTINATION vllm_xpu_kernels
    LANGUAGE ${VLLM_GPU_LANG}
    SOURCES ${FA2_GEN_SRCS}
    COMPILE_FLAGS ${VLLM_GPU_COMPILE_FLAGS}
    LINK_FLAGS ${VLLM_GPU_LINK_FLAGS}
    ARCHITECTURES ${VLLM_GPU_ARCHES}
    INCLUDE_DIRECTORIES ${VLLM_XPU_INCLUDE_DIR}
    USE_SABI 3
    LIBRARIES ${ATTN_KERNEL_LIB_NAME}
    WITH_SOABI)
endif()
#
# xpu only ops/kernels, implemented with cutlass/onednn/sycl (_xpu_C
# extension). Built only when XPU_SPECIFIC_KERNELS_ENABLED is ON.
#
if(XPU_SPECIFIC_KERNELS_ENABLED)
  message(STATUS "Enabling _xpu_C extension.")

  # Sources that are always part of the extension.
  set(VLLM_EXT_XPU_SRC
      "csrc/xpu/torch_bindings.cpp"
      "csrc/xpu/lora/lora_shrink.cpp"
      "csrc/xpu/lora/lora_expand.cpp"
      "csrc/xpu/sampler/topk_topp_sampler.cpp"
      "csrc/xpu/sycl/deepseek_scaling_rope.cpp"
      "csrc/xpu/rand/exponential.cpp"
      "csrc/xpu/utils.cpp")

  # Interface sources of the optional features.
  if(MOE_KERNELS_ENABLED)
    list(APPEND VLLM_EXT_XPU_SRC
         "csrc/xpu/grouped_gemm/grouped_gemm_interface.cpp")
  endif()
  if(GDN_KERNELS_ENABLED)
    list(APPEND VLLM_EXT_XPU_SRC "csrc/xpu/gdn_attn/gdn_attn_interface.cpp")
  endif()

  include_directories("/usr/include")

  # TODO: check if we need this flag:
  #   list(APPEND VLLM_GPU_FLAGS "-gline-tables-only")

  # oneDNN is mandatory for this extension.
  if(ONEDNN_FOUND)
    set(_ONEDNN_SRC)
    # CONFIGURE_DEPENDS makes the build re-run this glob, so that oneDNN source
    # files added or removed later are picked up without a manual reconfigure.
    file(GLOB _ONEDNN_SRC CONFIGURE_DEPENDS
         "${CMAKE_CURRENT_SOURCE_DIR}/csrc/xpu/onednn/*.cpp")
    list(APPEND VLLM_EXT_XPU_SRC ${_ONEDNN_SRC})
    # NOTE(review): both commands below are directory-scoped, so they also
    # apply to targets created later in this directory (e.g. _moe_C), not only
    # to _xpu_C — confirm that this is intended.
    include_directories(${ONEDNN_INCLUDE_DIR})
    link_libraries(${ONEDNN_LIBRARY})
  else()
    message(
      FATAL_ERROR "onednn not found but xpu specific kernels are enabled.")
  endif()

  # The grouped_gemm and gdn_attn library lists may be empty; the extension is
  # then linked without them.
  define_gpu_extension_target(
    _xpu_C
    DESTINATION vllm_xpu_kernels
    LANGUAGE ${VLLM_GPU_LANG}
    SOURCES ${VLLM_EXT_XPU_SRC}
    COMPILE_FLAGS ${VLLM_GPU_COMPILE_FLAGS}
    LINK_FLAGS ${VLLM_GPU_LINK_FLAGS}
    ARCHITECTURES ${VLLM_GPU_ARCHES}
    INCLUDE_DIRECTORIES ${VLLM_XPU_INCLUDE_DIR}
    LIBRARIES ${GROUPED_GEMM_LIB_NAME} ${GDN_ATTN_LIB_NAME}
    USE_SABI 3
    WITH_SOABI)
endif()
#
# _moe_C extension, built only when MOE_KERNELS_ENABLED is ON.
#
if(MOE_KERNELS_ENABLED)
  message(STATUS "Enabling _moe_C extension.")

  # Every .cpp file directly inside csrc/moe is part of the extension.
  # CONFIGURE_DEPENDS makes the build re-run this glob, so that files added or
  # removed later are picked up without a manual reconfigure.
  file(GLOB VLLM_MOE_EXT_SRC CONFIGURE_DEPENDS
       "${CMAKE_CURRENT_SOURCE_DIR}/csrc/moe/*.cpp")

  define_gpu_extension_target(
    _moe_C
    DESTINATION vllm_xpu_kernels
    LANGUAGE ${VLLM_GPU_LANG}
    SOURCES ${VLLM_MOE_EXT_SRC}
    COMPILE_FLAGS ${VLLM_GPU_COMPILE_FLAGS}
    LINK_FLAGS ${VLLM_GPU_LINK_FLAGS}
    ARCHITECTURES ${VLLM_GPU_ARCHES}
    INCLUDE_DIRECTORIES ${VLLM_XPU_INCLUDE_DIR}
    USE_SABI 3
    WITH_SOABI)
endif()