@@ -103,6 +103,7 @@ cmake_dependent_option(onnxruntime_USE_FLASH_ATTENTION "Build flash attention ke
# Optional kernel and build-speed switches. All default to OFF except the
# memory-efficient attention option, which is handled separately below.
option(
  onnxruntime_USE_LEAN_ATTENTION
  "Build lean attention kernel for scaled dot product attention"
  OFF
)

# User-selectable (default ON) only while onnxruntime_USE_CUDA is true;
# otherwise the option is forced to OFF.
cmake_dependent_option(
  onnxruntime_USE_MEMORY_EFFICIENT_ATTENTION
  "Build memory efficient attention kernel for scaled dot product attention"
  ON
  "onnxruntime_USE_CUDA"
  OFF
)

option(
  onnxruntime_USE_FPA_INTB_GEMM
  "Build FpA IntB gemm cuda kernels"
  OFF
)

option(
  onnxruntime_USE_INT4_KV_CACHE
  "Build cuda kernels for int4 kv cache"
  OFF
)

option(
  onnxruntime_QUICK_BUILD
  "Speed up build by skipping some kernels for faster development"
  OFF
)

option(
  onnxruntime_BUILD_FOR_NATIVE_MACHINE
  "Enable this option for turning on optimization specific to this machine"
  OFF
)
@@ -125,6 +126,7 @@ option(onnxruntime_DONT_VECTORIZE "Do not vectorize operations in Eigen" OFF)
125126
# Protobuf flavour selection.
option(
  onnxruntime_USE_FULL_PROTOBUF
  "Link to libprotobuf instead of libprotobuf-lite when this option is ON"
  OFF
)

# Debug-dump switches. Both default to OFF.
option(
  onnxruntime_DEBUG_NODE_INPUTS_OUTPUTS
  "Dump debug information about node inputs and outputs when executing the model."
  OFF
)

option(
  onnxruntime_DUMP_TENSOR
  "Dump tensor inside kernel."
  OFF
)

# Only selectable while onnxruntime_DEBUG_NODE_INPUTS_OUTPUTS is true;
# otherwise forced to OFF.
cmake_dependent_option(
  onnxruntime_DEBUG_NODE_INPUTS_OUTPUTS_ENABLE_DUMP_TO_SQLDB
  "Build dump debug information about node inputs and outputs with support for sql database."
  OFF
  "onnxruntime_DEBUG_NODE_INPUTS_OUTPUTS"
  OFF
)
129131
130132# When loading a delay loaded DLL, Windows searches the main EXE's folder first.
@@ -627,7 +629,6 @@ else()
# Probe the C++ compiler for optional warning flags. Each call stores its
# result in the named HAS_* variable.
# The former nvcc probe for -Wstrict-aliasing (NVCC_HAS_STRICT_ALIASING) is no
# longer performed here.
check_cxx_compiler_flag(-Wparentheses HAS_PARENTHESES)
check_cxx_compiler_flag(-Wshorten-64-to-32 HAS_SHORTEN_64_TO_32)
check_cxx_compiler_flag(-Wstrict-aliasing HAS_STRICT_ALIASING)
check_cxx_compiler_flag(-Wstringop-overflow HAS_STRINGOP_OVERFLOW)
check_cxx_compiler_flag(-Wtautological-pointer-compare HAS_TAUTOLOGICAL_POINTER_COMPARE)
check_cxx_compiler_flag(-Wundefined-var-template HAS_UNDEFINED_VAR_TEMPLATE)
@@ -774,8 +775,13 @@ if (onnxruntime_USE_CUDA)
774775 endif ()
775776
# Quick-build mode: pass ORT_QUICK_BUILD=1 to the provider sources. Per the
# status message, flash attention is then limited to head dimension 128.
if (onnxruntime_QUICK_BUILD)
  message(STATUS "Quick build mode: Flash attention limited to head dimension 128 only")
  list(APPEND ORT_PROVIDER_FLAGS -DORT_QUICK_BUILD=1)
endif ()

# Int4 KV cache: pass USE_INT4_KV_CACHE=1 to the provider sources.
if (onnxruntime_USE_INT4_KV_CACHE)
  message(STATUS "Enable int4 kv cache for CUDA EP")
  list(APPEND ORT_PROVIDER_FLAGS -DUSE_INT4_KV_CACHE=1)
endif ()
780786endif ()
781787
@@ -1433,6 +1439,9 @@ if (Git_FOUND)
# Record which optional build variants are enabled by appending a
# "name=1, " fragment to ORT_BUILD_INFO for each one.
if (onnxruntime_QUICK_BUILD)
  string(APPEND ORT_BUILD_INFO "quick-build=1, ")
endif ()

if (onnxruntime_USE_INT4_KV_CACHE)
  string(APPEND ORT_BUILD_INFO "int4-kv-cache=1, ")
endif ()
14361445endif ()
# Finish ORT_BUILD_INFO with the build type.
# NOTE(review): CMAKE_BUILD_TYPE is typically empty under multi-config
# generators, so this suffix may carry no value there -- confirm that is
# acceptable.
string(APPEND ORT_BUILD_INFO "build type=${CMAKE_BUILD_TYPE} ")

# Generate onnxruntime_config.h in the current binary directory from its
# template. The output path must be a single argument: directory and file name
# are joined with "/" and quoted so they cannot be split.
configure_file(onnxruntime_config.h.in "${CMAKE_CURRENT_BINARY_DIR}/onnxruntime_config.h")
@@ -1441,11 +1450,13 @@ get_property(onnxruntime_GENERATOR_IS_MULTI_CONFIG GLOBAL PROPERTY GENERATOR_IS_
14411450if (onnxruntime_USE_CUDA)
# Request C++17 for CUDA sources.
set(CMAKE_CUDA_STANDARD 17)

if (onnxruntime_CUDA_HOME)
  # file(TO_CMAKE_PATH <path> <out-var>) takes the input path first and the
  # output variable second. The result lands in CUDAToolkit_ROOT, the root
  # hint that find_package(CUDAToolkit) consults. The path is quoted so that a
  # value containing ';' stays a single argument.
  file(TO_CMAKE_PATH "${onnxruntime_CUDA_HOME}" CUDAToolkit_ROOT)
endif ()

find_package(CUDAToolkit REQUIRED)
14471456
# With a CUDA compiler at version 11.8 or newer, define ENABLE_BF16 and
# ENABLE_FP8 for the sources in this directory scope and report both.
if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 11.8)
  add_definitions(
    "-DENABLE_BF16"
    "-DENABLE_FP8"
  )
  message(STATUS "CUDA Toolkit version is greater or equal than 11.8, enable -DENABLE_BF16 flag")
  message(STATUS "CUDA Toolkit version is greater or equal than 11.8, enable -DENABLE_FP8 flag")
endif ()
@@ -1779,6 +1790,10 @@ if (onnxruntime_DEBUG_NODE_INPUTS_OUTPUTS)
17791790 add_compile_definitions (DEBUG_NODE_INPUTS_OUTPUTS)
17801791endif ()
17811792
# When onnxruntime_DUMP_TENSOR is enabled, compile with DUMP_TENSOR_LEVEL=1.
if (onnxruntime_DUMP_TENSOR)
  add_compile_definitions(
    DUMP_TENSOR_LEVEL=1
  )
endif ()
1796+
17821797if (onnxruntime_ENABLE_EXTERNAL_CUSTOM_OP_SCHEMAS)
17831798 if (NOT CMAKE_SYSTEM_NAME STREQUAL "Linux" )
17841799 message (FATAL_ERROR "External custom operator schemas feature is only supported on Linux" )
@@ -1801,8 +1816,11 @@ if (onnxruntime_ENABLE_EXTERNAL_CUSTOM_OP_SCHEMAS)
18011816 )
18021817endif ()
18031818
1804- if (NOT onnxruntime_BUILD_SHARED_LIB AND onnxruntime_USE_WEBGPU)
1805- message (WARNING "CMake target files will not be generated for static onnxruntime builds with webgpu support" )
1819+ if (NOT onnxruntime_BUILD_SHARED_LIB AND
1820+ (onnxruntime_USE_WEBGPU OR (CMAKE_SYSTEM_NAME STREQUAL "Emscripten" AND onnxruntime_USE_XNNPACK)))
1821+ message (WARNING
1822+ "CMake target files will not be generated for static onnxruntime builds "
1823+ "with WebGPU or Emscripten+XNNPACK support" )
18061824else ()
18071825 # Install
18081826 include (CMakePackageConfigHelpers)
0 commit comments