diff --git a/.gitignore b/.gitignore
index 2e7ec7f4ae3..090313a83c3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -34,3 +34,4 @@ ehthumbs.db
 Thumbs.db
 *.swp
 .vscode
+.vs
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9d35815a295..a3e99e076fc 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -277,6 +277,12 @@ ELSEIF(CMAKE_C_COMPILER_ID STREQUAL "Clang")
   ENDIF()
 ELSEIF(CMAKE_C_COMPILER_ID STREQUAL "MSVC")
   SET(XNNPACK_ENABLE_AVX512BF16 OFF)
+  IF(MSVC_VERSION LESS_EQUAL 1941)
+    # /vlen option not supported
+    SET(XNNPACK_ENABLE_AVX256SKX OFF)
+    SET(XNNPACK_ENABLE_AVX256VNNI OFF)
+    SET(XNNPACK_ENABLE_AVX256VNNIGFNI OFF)
+  ENDIF()
 ENDIF()
 OPTION(XNNPACK_ENABLE_HVX "Build XNNPACK with Hexagon HVX micro-kernels" ON)
 OPTION(XNNPACK_ENABLE_KLEIDIAI "Use KleidiAI GEMM microkernels for Arm" ON)
@@ -341,6 +347,8 @@ IF(CMAKE_C_COMPILER_ID STREQUAL "MSVC")
   # Test files have many sections, increase the limit. See
   # https://learn.microsoft.com/en-us/cpp/build/reference/bigobj-increase-number-of-sections-in-dot-obj-file.
   ADD_COMPILE_OPTIONS("/bigobj")
+  # Our float16 datatypes have constructors in C++ and not in C, which prompts a warning in MSVC
+  ADD_COMPILE_OPTIONS("/wd4190")
 ENDIF()
 
 IF(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
diff --git a/src/configs/gemm-config.c b/src/configs/gemm-config.c
index 41ebde38000..06249611e43 100644
--- a/src/configs/gemm-config.c
+++ b/src/configs/gemm-config.c
@@ -2799,7 +2799,11 @@ static void init_qd8_f16_qc8w_gemm_config(void) {
   qd8_f16_qc8w_gemm_config.pack_weights_and_biases = NULL;  // Override the default packing function.
   qd8_f16_qc8w_gemm_config.packed_stride_weights_and_biases = NULL;  // Override the default packing function.
   qd8_f16_qc8w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_gemm_gio_w;
+  #if XNN_ENABLE_AVX256VNNI
   qd8_f16_qc8w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_qs8_packw_gemm_goi_ukernel_x64c4__avx256vnni_prfm;
+  #else
+  qd8_f16_qc8w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_gemm_goi_w;
+  #endif
   qd8_f16_qc8w_gemm_config.mr = 16;
   qd8_f16_qc8w_gemm_config.nr = 64;
   qd8_f16_qc8w_gemm_config.log2_kr = 2;
@@ -3389,7 +3393,11 @@ static void init_qd8_f32_qc8w_gemm_config(void) {
   qd8_f32_qc8w_gemm_config.pack_weights_and_biases = NULL;  // Override the default packing function.
   qd8_f32_qc8w_gemm_config.packed_stride_weights_and_biases = NULL;  // Override the default packing function.
   qd8_f32_qc8w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_gemm_gio_w;
+  #if XNN_ENABLE_AVX256VNNI
   qd8_f32_qc8w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_qs8_packw_gemm_goi_ukernel_x64c4__avx256vnni_prfm;
+  #else
+  qd8_f32_qc8w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_gemm_goi_w;
+  #endif
   qd8_f32_qc8w_gemm_config.mr = 16;
   qd8_f32_qc8w_gemm_config.nr = 64;
   qd8_f32_qc8w_gemm_config.log2_kr = 2;
@@ -4089,7 +4097,11 @@ static void init_qs8_qc8w_gemm_config(void) {
   qs8_qc8w_gemm_config.pack_weights_and_biases = NULL;  // Override the default packing function.
   qs8_qc8w_gemm_config.packed_stride_weights_and_biases = NULL;  // Override the default packing function.
   qs8_qc8w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_gemm_gio_w;
+  #if XNN_ENABLE_AVX256VNNI
   qs8_qc8w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_qs8_packw_gemm_goi_ukernel_x64c4__avx256vnni_prfm;
+  #else
+  qs8_qc8w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_gemm_goi_w;
+  #endif
   qs8_qc8w_gemm_config.pack_igemm_goki = (xnn_pack_conv_goki_w_fn) xnn_pack_qs8_conv_goki_w;
   qs8_qc8w_gemm_config.pack_igemm_kgo = (xnn_pack_conv_kgo_w_fn) xnn_pack_qs8_conv_kgo_w;
   qs8_qc8w_gemm_config.pack_deconv_goki = (xnn_pack_deconv_goki_w_fn) xnn_pack_qs8_deconv_goki_w;
diff --git a/src/operator-run.c b/src/operator-run.c
index d67548378e3..5b36d6d9194 100644
--- a/src/operator-run.c
+++ b/src/operator-run.c
@@ -2237,14 +2237,14 @@ void xnn_compute_f16_qd8_convert(
     const struct f16_qd8_convert_context context[restrict XNN_MIN_ELEMENTS(1)],
     size_t batch_index)
 {
-  return xnn_compute_f16_qx8_convert(context, xnn_f16_qd8_asymmetric_quantization_params, batch_index);
+  xnn_compute_f16_qx8_convert(context, xnn_f16_qd8_asymmetric_quantization_params, batch_index);
 }
 
 void xnn_compute_f16_qdu8_convert(
     const struct f16_qd8_convert_context context[restrict XNN_MIN_ELEMENTS(1)],
     size_t batch_index)
 {
-  return xnn_compute_f16_qx8_convert(context, xnn_f16_qdu8_asymmetric_quantization_params, batch_index);
+  xnn_compute_f16_qx8_convert(context, xnn_f16_qdu8_asymmetric_quantization_params, batch_index);
 }
 
 void xnn_compute_f32_qx8_convert(
@@ -2273,14 +2273,14 @@ void xnn_compute_f32_qd8_convert(
     const struct f32_qd8_convert_context context[restrict XNN_MIN_ELEMENTS(1)],
     size_t batch_index)
 {
-  return xnn_compute_f32_qx8_convert(context, xnn_f32_qd8_asymmetric_quantization_params, batch_index);
+  xnn_compute_f32_qx8_convert(context, xnn_f32_qd8_asymmetric_quantization_params, batch_index);
 }
 
 void xnn_compute_f32_qdu8_convert(
     const struct f32_qd8_convert_context context[restrict XNN_MIN_ELEMENTS(1)],
     size_t batch_index)
 {
-  return xnn_compute_f32_qx8_convert(context, xnn_f32_qdu8_asymmetric_quantization_params, batch_index);
+  xnn_compute_f32_qx8_convert(context, xnn_f32_qdu8_asymmetric_quantization_params, batch_index);
 }
 
 void xnn_compute_pack_lh(