From 5f1618591bacb79658b2d87b64cd0ec99117e216 Mon Sep 17 00:00:00 2001 From: "jing.bao" Date: Wed, 21 Aug 2024 13:28:35 +0800 Subject: [PATCH 1/8] Enable relaxed simd build Build with --enable_wasm_relaxed_simd. A trick here is to rename mjs file like: cp ../../../build_wasm/Release/ort-wasm-relaxedsimd-threaded.mjs ./ort-wasm-simd-threaded.mjs cp ../../../build_wasm/Release/ort-wasm-relaxedsimd-threaded.wasm . --- cmake/adjust_global_compile_flags.cmake | 5 ++++- cmake/external/xnnpack.cmake | 7 ++++++- cmake/onnxruntime_mlas.cmake | 2 +- cmake/onnxruntime_webassembly.cmake | 4 +++- tools/ci_build/build.py | 2 ++ 5 files changed, 16 insertions(+), 4 deletions(-) diff --git a/cmake/adjust_global_compile_flags.cmake b/cmake/adjust_global_compile_flags.cmake index 2aa83e9e3ee96..2ad9ba55f3269 100644 --- a/cmake/adjust_global_compile_flags.cmake +++ b/cmake/adjust_global_compile_flags.cmake @@ -35,7 +35,10 @@ if (CMAKE_SYSTEM_NAME STREQUAL "Emscripten") set(CMAKE_CXX_FLAGS_DEBUG "-g2") endif() - if (onnxruntime_ENABLE_WEBASSEMBLY_SIMD) + if (onnxruntime_ENABLE_WEBASSEMBLY_RELAXED_SIMD) + string(APPEND CMAKE_C_FLAGS " -msimd128 -mrelaxed-simd") + string(APPEND CMAKE_CXX_FLAGS " -msimd128 -mrelaxed-simd") + elseif (onnxruntime_ENABLE_WEBASSEMBLY_SIMD) string(APPEND CMAKE_C_FLAGS " -msimd128") string(APPEND CMAKE_CXX_FLAGS " -msimd128") endif() diff --git a/cmake/external/xnnpack.cmake b/cmake/external/xnnpack.cmake index 02ef9a198a803..d0ab770053be1 100644 --- a/cmake/external/xnnpack.cmake +++ b/cmake/external/xnnpack.cmake @@ -143,7 +143,12 @@ if(CMAKE_SYSTEM_NAME STREQUAL "Emscripten") list(APPEND wasm_srcs ${XNNPACK_DIR}/src/amalgam/gen/scalar.c) list(APPEND wasm_srcs ${XNNPACK_DIR}/src/amalgam/gen/wasm.c) - if(onnxruntime_ENABLE_WEBASSEMBLY_SIMD) + if(onnxruntime_ENABLE_WEBASSEMBLY_RELAXED_SIMD) + list(APPEND wasm_srcs ${XNNPACK_DIR}/src/amalgam/gen/wasmsimd.c) + list(APPEND wasm_srcs ${XNNPACK_DIR}/src/amalgam/gen/wasmrelaxedsimd.c) + target_compile_options(XNNPACK PRIVATE "-msimd128") + target_compile_options(XNNPACK PRIVATE "-mrelaxed-simd") + elseif(onnxruntime_ENABLE_WEBASSEMBLY_SIMD) list(APPEND wasm_srcs ${XNNPACK_DIR}/src/amalgam/gen/wasmsimd.c) target_compile_options(XNNPACK PRIVATE "-msimd128") endif() diff --git a/cmake/onnxruntime_mlas.cmake b/cmake/onnxruntime_mlas.cmake index 15864a0198161..2be5117503d64 100644 --- a/cmake/onnxruntime_mlas.cmake +++ b/cmake/onnxruntime_mlas.cmake @@ -252,7 +252,7 @@ function(setup_mlas_source_for_windows) endfunction() if (CMAKE_SYSTEM_NAME STREQUAL "Emscripten") - if (onnxruntime_ENABLE_WEBASSEMBLY_SIMD) + if (onnxruntime_ENABLE_WEBASSEMBLY_SIMD OR onnxruntime_ENABLE_WEBASSEMBLY_RELAXED_SIMD) file(GLOB_RECURSE mlas_platform_srcs "${MLAS_SRC_DIR}/wasm_simd/*.cpp" ) diff --git a/cmake/onnxruntime_webassembly.cmake b/cmake/onnxruntime_webassembly.cmake index f3afaf7033fd1..bfb73e14ce7a4 100644 --- a/cmake/onnxruntime_webassembly.cmake +++ b/cmake/onnxruntime_webassembly.cmake @@ -485,7 +485,9 @@ jsepDownload:_pp_") list(APPEND target_name_list "wasm") - if (onnxruntime_ENABLE_WEBASSEMBLY_SIMD) + if (onnxruntime_ENABLE_WEBASSEMBLY_RELAXED_SIMD) + list(APPEND target_name_list "relaxedsimd") + elseif (onnxruntime_ENABLE_WEBASSEMBLY_SIMD) list(APPEND target_name_list "simd") endif() diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index db7dbed23a2d2..42f208315c627 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -507,6 +507,7 @@ def convert_arg_line_to_args(self, arg_line): parser.add_argument("--emsdk_version", default="4.0.4", help="Specify version of emsdk") parser.add_argument("--enable_wasm_simd", action="store_true", help="Enable WebAssembly SIMD") + parser.add_argument("--enable_wasm_relaxed_simd", action="store_true", help="Enable WebAssembly Relaxed SIMD") parser.add_argument("--enable_wasm_threads", action="store_true", help="Enable WebAssembly multi-threads support") parser.add_argument( @@ -1422,6 +1423,7 @@ def generate_build_tree( cmake_args.append("-Donnxruntime_DNNL_ACL_ROOT=" + args.dnnl_acl_root) if args.build_wasm: cmake_args.append("-Donnxruntime_ENABLE_WEBASSEMBLY_SIMD=" + ("ON" if args.enable_wasm_simd else "OFF")) + cmake_args.append("-Donnxruntime_ENABLE_WEBASSEMBLY_RELAXED_SIMD=" + ("ON" if args.enable_wasm_relaxed_simd else "OFF")) if args.use_migraphx: cmake_args.append("-Donnxruntime_MIGRAPHX_HOME=" + migraphx_home) if args.use_rocm: From b655b7a1d42d54e1988c7b70c391709e28d4949c Mon Sep 17 00:00:00 2001 From: "jing.bao" Date: Thu, 5 Sep 2024 16:30:28 +0800 Subject: [PATCH 2/8] Add Wasm VNNI dispatch for QGemmU8X8 --- cmake/onnxruntime_mlas.cmake | 6 + onnxruntime/core/mlas/inc/mlas.h | 5 +- onnxruntime/core/mlas/lib/mlasi.h | 5 + onnxruntime/core/mlas/lib/qgemm.h | 8 + .../mlas/lib/qgemm_kernel_wasmrelaxedsimd.cpp | 568 ++++++++++++++++++ 5 files changed, 591 insertions(+), 1 deletion(-) create mode 100644 onnxruntime/core/mlas/lib/qgemm_kernel_wasmrelaxedsimd.cpp diff --git a/cmake/onnxruntime_mlas.cmake b/cmake/onnxruntime_mlas.cmake index 2be5117503d64..714b14374f0fe 100644 --- a/cmake/onnxruntime_mlas.cmake +++ b/cmake/onnxruntime_mlas.cmake @@ -260,6 +260,12 @@ if (CMAKE_SYSTEM_NAME STREQUAL "Emscripten") ${mlas_platform_srcs} ${MLAS_SRC_DIR}/qgemm_kernel_wasmsimd.cpp ) + if (onnxruntime_ENABLE_WEBASSEMBLY_RELAXED_SIMD) + set(mlas_platform_srcs + ${mlas_platform_srcs} + ${MLAS_SRC_DIR}/qgemm_kernel_wasmrelaxedsimd.cpp + ) + endif() else() file(GLOB_RECURSE mlas_platform_srcs "${MLAS_SRC_DIR}/scalar/*.cpp" diff --git a/onnxruntime/core/mlas/inc/mlas.h b/onnxruntime/core/mlas/inc/mlas.h index 1401e27ca77e5..a4052915bbae6 100644 --- a/onnxruntime/core/mlas/inc/mlas.h +++ b/onnxruntime/core/mlas/inc/mlas.h @@ -63,7 +63,10 @@ Module Name: #endif #if defined(__wasm__) #define MLAS_TARGET_WASM -#if defined(__wasm_simd128__) +#if defined(__wasm_relaxed_simd__) +#define MLAS_TARGET_WASM_RELAXED_SIMD +#define MLAS_TARGET_WASM_SIMD +#elif defined(__wasm_simd128__) #define MLAS_TARGET_WASM_SIMD #else #define MLAS_TARGET_WASM_SCALAR diff --git a/onnxruntime/core/mlas/lib/mlasi.h b/onnxruntime/core/mlas/lib/mlasi.h index 0681b49252495..507cf35d1c3c4 100644 --- a/onnxruntime/core/mlas/lib/mlasi.h +++ b/onnxruntime/core/mlas/lib/mlasi.h @@ -996,9 +996,14 @@ extern const MLAS_GEMM_QUANT_DISPATCH MlasGemmS8S8DispatchSdot; extern const MLAS_GEMM_QUANT_DISPATCH MlasGemmU8X8DispatchUmmla; extern const MLAS_GEMM_QUANT_DISPATCH MlasGemmS8S8DispatchSmmla; extern const MLAS_GEMM_QUANT_DISPATCH MlasGemmU8X8DispatchWasmSimd; +extern const MLAS_GEMM_QUANT_DISPATCH MlasGemmU8X8DispatchWasmRelaxedSimd; extern const MLAS_GEMM_QUANT_DISPATCH MlasGemmQuantDispatchDefault; extern const MLAS_GEMM_QUANT_DISPATCH MlasGemm8X8DispatchPOWER10; +#if defined(MLAS_TARGET_WASM_RELAXED_SIMD) +extern bool HasUSDot(); +#endif + // // Symmetric quantized qgemm dispatch structure // diff --git a/onnxruntime/core/mlas/lib/qgemm.h b/onnxruntime/core/mlas/lib/qgemm.h index bcd878efa681b..596267c3abdff 100644 --- a/onnxruntime/core/mlas/lib/qgemm.h +++ b/onnxruntime/core/mlas/lib/qgemm.h @@ -886,6 +886,14 @@ MlasGemmQuantGetDispatch( if(BIsSigned || !AIsSigned) { GemmQuantDispatch = &MlasGemmU8X8DispatchNeon; } +#elif defined(MLAS_TARGET_WASM_RELAXED_SIMD) + if (!AIsSigned) { + if (HasUSDot()) { + GemmQuantDispatch = &MlasGemmU8X8DispatchWasmRelaxedSimd; + } else { + GemmQuantDispatch = &MlasGemmU8X8DispatchWasmSimd; + } + } #elif defined(MLAS_TARGET_WASM_SIMD) if (!AIsSigned) { GemmQuantDispatch = &MlasGemmU8X8DispatchWasmSimd; diff --git a/onnxruntime/core/mlas/lib/qgemm_kernel_wasmrelaxedsimd.cpp b/onnxruntime/core/mlas/lib/qgemm_kernel_wasmrelaxedsimd.cpp new file mode 100644 index 0000000000000..2c398eadead54 --- /dev/null +++ b/onnxruntime/core/mlas/lib/qgemm_kernel_wasmrelaxedsimd.cpp @@ -0,0 +1,568 @@ +/*++ + +Copyright (c) Microsoft Corporation. All rights reserved. + +Licensed under the MIT License. + +Module Name: + + qgemm_kernel_wasmsimd.cpp + +Abstract: + + This module implements QGEMM kernel for WebAssembly SIMD128. + +--*/ + +#include "mlasi.h" +#include "qgemm.h" + +bool HasUSDot() { +// Check out-of-bounds behaviour of Relaxed Integer Dot Product with Accumulation with signed and unsigned input (e.g. vpdpbusd). + const v128_t int8_input = wasm_i8x16_const(0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0); + const volatile v128_t xint8_input = wasm_i8x16_const(0, 0, 0, -128, 0, 0, -128, 0, 0, -128, 0, 0, -128, 0, 0, 0); // volatile to confuse Clang which otherwise ICE's + const v128_t xint8_output = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(int8_input, xint8_input, wasm_i8x16_const_splat(0)); + + const volatile v128_t overflow_input = wasm_i8x16_const(-128, -128, -128, -128, -128, -128, -1, -1, -1, -1, -128, -128, -1, -1, -1, -1); // volatile to confuse Clang which otherwise ICE's + const v128_t overflow_output = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(wasm_i8x16_const_splat(-128), overflow_input, wasm_i8x16_const_splat(0)); + return !wasm_v128_any_true(wasm_v128_or( + wasm_v128_xor(xint8_output, wasm_i32x4_const_splat(128)), + wasm_v128_xor(overflow_output, wasm_i32x4_const(-65536, -98048, -98048, -130560)))); +} + +// wasm implementation of "_mm_unpacklo_epi8" +v128_t __attribute__((__always_inline__, __nodebug__)) wasm_i8x16_unpacklo_relaxed(v128_t a, v128_t b) { + return wasm_i8x16_shuffle(a, b, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); +} + +// wasm implementation of "_mm_unpackhi_epi8" +v128_t __attribute__((__always_inline__, __nodebug__)) wasm_i8x16_unpackhi_relaxed(v128_t a, v128_t b) { + return wasm_i8x16_shuffle(a, b, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); +} + +// wasm implementation of "_mm_unpacklo_epi16" +v128_t __attribute__((__always_inline__, __nodebug__)) wasm_i16x8_unpacklo_relaxed(v128_t a, v128_t b) { + return wasm_i8x16_shuffle(a, b, 0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23); +} + +// wasm implementation of "_mm_unpackhi_epi16" +v128_t __attribute__((__always_inline__, __nodebug__)) wasm_i16x8_unpackhi_relaxed(v128_t a, v128_t b) { + return wasm_i8x16_shuffle(a, b, 8, 9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31); +} + +struct MLAS_GEMM_U8X8_KERNEL_WASMRELAXEDSIMD +{ + typedef uint8_t PackedAType; + typedef uint8_t PackedBType; + typedef uint8_t OffsetAType; + typedef int8_t OffsetBType; + + static constexpr size_t PackedK = 4; + static constexpr MLAS_GEMM_QUANT_STRIDES Strides{ 12, 128, 128 }; + static constexpr MLAS_GEMM_QUANT_STRIDES PackedStrides{0, 0, 0}; +}; + +constexpr size_t MLAS_GEMM_U8X8_KERNEL_WASMRELAXEDSIMD::PackedK; +constexpr MLAS_GEMM_QUANT_STRIDES MLAS_GEMM_U8X8_KERNEL_WASMRELAXEDSIMD::Strides; + +template<> +MLAS_FORCEINLINE +int32_t +MlasGemmQuantFixupZeroPointB( + int32_t ZeroPointB, + bool BIsSigned + ) +{ + if (!BIsSigned) { + ZeroPointB = MLAS_GEMM_U8X8_KERNEL_WASMRELAXEDSIMD::OffsetBType(ZeroPointB ^ 0x80); + } + + return ZeroPointB; +} + +template<> +void +MlasGemmQuantCopyPackA( + MLAS_GEMM_U8X8_KERNEL_WASMRELAXEDSIMD::PackedAType* D, + const uint8_t* A, + size_t lda, + size_t CountM, + size_t CountK, + int32_t* RowSumBuffer, + bool AIsSigned + ) +{ + MLAS_UNREFERENCED_PARAMETER(AIsSigned); + const v128_t ZeroVector = wasm_i64x2_const(0, 0); + const v128_t OnesWordBroadcast = wasm_i16x8_splat(1); + uint8_t PaddedMatrixAData[8] = { 0 }; + + // + // Process a single row of matrix A in a loop. + // + + while (CountM > 0) { + + const uint8_t* a = A; + size_t k = CountK; + v128_t ReductionVector = ZeroVector; + + // + // Copy the source bytes to the packed buffer. + // + // The packed buffer has the same data ordering as the source bytes, + // but CountK is aligned up to a multiple of 4 to maintain 32-bit + // alignment. All extra bytes are zero-padded. + // + // Zero extend the source bytes to 16-bits and accumulate + // into an intermediate per-row + // accumulator. CountK cannot be greater than 128 to avoid overflowing + // these signed 16-bit accumulators. + // + + while (k >= 8) { + + v128_t Bytes = wasm_v128_load64_zero(&a[0]); + v128_t Words = wasm_i8x16_unpacklo_relaxed(Bytes, ZeroVector); + + ReductionVector = wasm_i16x8_add(ReductionVector, Words); + + wasm_v128_store64_lane(&D[0], Bytes, 0); + + a += 8; + D += 8; + k -= 8; + } + + if (k > 0) { + + // + // Copy the remaining bytes to the zero padded stack buffer. + // + + uint8_t* padded = PaddedMatrixAData; + uint8_t* padded_end = padded + k; + + do { + padded[0] = a[0]; + padded++; + a++; + } while (padded < padded_end); + + v128_t Bytes = wasm_v128_load64_zero(PaddedMatrixAData); + v128_t Words = wasm_i8x16_unpacklo_relaxed(Bytes, ZeroVector); + + ReductionVector = wasm_i16x8_add(ReductionVector, Words); + + // + // Copy quads of 8-bit values from the vector to the packed + // buffer and rotate the vector for the next iteration. + // + + for (size_t quads = (k + 3) / 4; quads > 0; quads--) { + *((int32_t*)D) = wasm_i32x4_extract_lane(Bytes, 0); + D += 4; + Bytes = wasm_i32x4_shuffle(Bytes, wasm_i32x4_splat(0), 1, 2, 3, 0); + } + } + + // + // Reduce the partial accumulators. + // + + ReductionVector = wasm_i32x4_dot_i16x8(ReductionVector, OnesWordBroadcast); + ReductionVector = wasm_i32x4_add(ReductionVector, + wasm_i32x4_shuffle(ReductionVector, wasm_i32x4_splat(0), 2, 3, 2, 3)); + ReductionVector = wasm_i32x4_add(ReductionVector, + wasm_i32x4_shuffle(ReductionVector, wasm_i32x4_splat(0), 1, 0, 1, 0)); + + *RowSumBuffer++ = wasm_i32x4_extract_lane(ReductionVector, 0); + + A += lda; + CountM -= 1; + } +} + + +MLAS_FORCEINLINE +void +MlasGemmU8X8CopyPackBProcessWasmRelaxedSimd( + MLAS_GEMM_U8X8_KERNEL_WASMRELAXEDSIMD::PackedBType* D, + v128_t BytesRow0, + v128_t BytesRow1, + v128_t BytesRow2, + v128_t BytesRow3, + v128_t BitFlipVector, + v128_t OnesByteBroadcast, + v128_t ColumnSums[2] +) +{ + v128_t PairsInterleaved0 = wasm_i8x16_unpacklo_relaxed(BytesRow0, BytesRow1); + v128_t PairsInterleaved1 = wasm_i8x16_unpacklo_relaxed(BytesRow2, BytesRow3); + + PairsInterleaved0 = wasm_v128_xor(PairsInterleaved0, BitFlipVector); + PairsInterleaved1 = wasm_v128_xor(PairsInterleaved1, BitFlipVector); + + v128_t QuadsInterleaved0 = wasm_i16x8_unpacklo_relaxed(PairsInterleaved0, PairsInterleaved1); + v128_t QuadsInterleaved1 = wasm_i16x8_unpackhi_relaxed(PairsInterleaved0, PairsInterleaved1); + + ColumnSums[0] = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(QuadsInterleaved0, OnesByteBroadcast, ColumnSums[0]); + ColumnSums[1] = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(QuadsInterleaved1, OnesByteBroadcast, ColumnSums[1]); + + wasm_v128_store(&D[0], QuadsInterleaved0); + wasm_v128_store(&D[16], QuadsInterleaved1); +} + +template<> +void +MlasGemmQuantCopyPackB( + MLAS_GEMM_U8X8_KERNEL_WASMRELAXEDSIMD::PackedBType* D, + const uint8_t* B, + size_t ldb, + size_t CountN, + size_t CountK, + int32_t* ColumnSumBuffer, + bool BIsSigned + ) +{ + const v128_t OnesByteBroadcast = wasm_i8x16_splat(1); + const v128_t BitFlipVector = wasm_i32x4_splat(BIsSigned ? 0 : 0x80808080); + + // + // Process 8 columns of matrix B in a loop. + // + + while (CountN >= 8) { + + const uint8_t* b = B; + size_t k = CountK; + v128_t ColumnSums[2]; + + ColumnSums[0] = wasm_i64x2_const(0, 0); + ColumnSums[1] = wasm_i64x2_const(0, 0); + + // + // Interleave rows of matrix B and write to the packed buffer. + // + + while (k >= MLAS_GEMM_U8X8_KERNEL_WASMRELAXEDSIMD::PackedK) { + + v128_t BytesRow0 = wasm_v128_load64_zero(&b[0]); + v128_t BytesRow1 = wasm_v128_load64_zero(&b[ldb]); + v128_t BytesRow2 = wasm_v128_load64_zero(&b[ldb * 2]); + v128_t BytesRow3 = wasm_v128_load64_zero(&b[ldb * 3]); + + MlasGemmU8X8CopyPackBProcessWasmRelaxedSimd(D, BytesRow0, BytesRow1, BytesRow2, BytesRow3, BitFlipVector, OnesByteBroadcast, ColumnSums); + + b += ldb * 4; + D += 32; + k -= 4; + } + + if (k > 0) { + + v128_t BytesRow0 = wasm_v128_load64_zero(&b[0]); + v128_t BytesRow1 = BitFlipVector; + v128_t BytesRow2 = BitFlipVector; + v128_t BytesRow3 = BitFlipVector; + + if (k >= 2) { + BytesRow1 = wasm_v128_load64_zero(&b[ldb]); + } + + if (k >= 3) { + BytesRow2 = wasm_v128_load64_zero(&b[ldb * 2]); + } + + MlasGemmU8X8CopyPackBProcessWasmRelaxedSimd(D, BytesRow0, BytesRow1, BytesRow2, BytesRow3, BitFlipVector, OnesByteBroadcast, ColumnSums); + + D += 32; + } + + wasm_v128_store(&ColumnSumBuffer[0], ColumnSums[0]); + wasm_v128_store(&ColumnSumBuffer[4], ColumnSums[1]); + ColumnSumBuffer += 8; + + B += 8; + CountN -= 8; + } + + // + // Process the remaining columns of matrix B. + // + + if (CountN > 0) { + + const uint8_t* b = B; + size_t k = CountK; + v128_t ColumnSums[2]; + uint8_t PaddedMatrixBData[32]; + + wasm_v128_store(&PaddedMatrixBData[0], BitFlipVector); + wasm_v128_store(&PaddedMatrixBData[16], BitFlipVector); + + ColumnSums[0] = wasm_i64x2_const(0, 0); + ColumnSums[1] = wasm_i64x2_const(0, 0); + + // + // Interleave rows of matrix B using an intermediate zero padded stack + // buffer and write to the packed buffer. + // + + while (k >= MLAS_GEMM_U8X8_KERNEL_WASMRELAXEDSIMD::PackedK) { + + const uint8_t* bcopy = b; + uint8_t* padded = PaddedMatrixBData; + uint8_t* padded_end = padded + CountN; + + do { + padded[0] = bcopy[0]; + padded[8] = bcopy[ldb]; + padded[16] = bcopy[ldb * 2]; + padded[24] = bcopy[ldb * 3]; + padded++; + bcopy++; + } while (padded < padded_end); + + v128_t BytesRow0 = wasm_v128_load64_zero(&PaddedMatrixBData[0]); + v128_t BytesRow1 = wasm_v128_load64_zero(&PaddedMatrixBData[8]); + v128_t BytesRow2 = wasm_v128_load64_zero(&PaddedMatrixBData[16]); + v128_t BytesRow3 = wasm_v128_load64_zero(&PaddedMatrixBData[24]); + + MlasGemmU8X8CopyPackBProcessWasmRelaxedSimd(D, BytesRow0, BytesRow1, BytesRow2, BytesRow3, BitFlipVector, OnesByteBroadcast, ColumnSums); + + b += ldb * 4; + D += 32; + k -= 4; + } + + if (k > 0) { + + const uint8_t* bcopy = b; + uint8_t* padded = PaddedMatrixBData; + uint8_t* padded_end = padded + CountN; + + wasm_v128_store(&PaddedMatrixBData[0], BitFlipVector); + wasm_v128_store(&PaddedMatrixBData[16], BitFlipVector); + + if (k == 3) { + do { + padded[0] = bcopy[0]; + padded[8] = bcopy[ldb]; + padded[16] = bcopy[ldb * 2]; + padded++; + bcopy++; + } while (padded < padded_end); + } else if (k == 2) { + do { + padded[0] = bcopy[0]; + padded[8] = bcopy[ldb]; + padded++; + bcopy++; + } while (padded < padded_end); + } else { + do { + padded[0] = bcopy[0]; + padded++; + bcopy++; + } while (padded < padded_end); + } + + v128_t BytesRow0 = wasm_v128_load64_zero(&PaddedMatrixBData[0]); + v128_t BytesRow1 = wasm_v128_load64_zero(&PaddedMatrixBData[8]); + v128_t BytesRow2 = wasm_v128_load64_zero(&PaddedMatrixBData[16]); + v128_t BytesRow3 = wasm_v128_load64_zero(&PaddedMatrixBData[24]); + + MlasGemmU8X8CopyPackBProcessWasmRelaxedSimd(D, BytesRow0, BytesRow1, BytesRow2, BytesRow3, BitFlipVector, OnesByteBroadcast, ColumnSums); + } + + wasm_v128_store(&ColumnSumBuffer[0], ColumnSums[0]); + wasm_v128_store(&ColumnSumBuffer[4], ColumnSums[1]); + } +} + +MLAS_FORCEINLINE +void +MlasGemmU8X8MultiplyAccumulateRowWasmRelaxedSimd( + v128_t ABroadcast, + const uint8_t* B, + v128_t Accumulators[2] +) +{ + v128_t BElements0 = wasm_v128_load(&B[0]); + v128_t BElements1 = wasm_v128_load(&B[16]); + + Accumulators[0] = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(BElements0, ABroadcast, Accumulators[0]); + Accumulators[1] = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(BElements1, ABroadcast, Accumulators[1]); +} + + +template<> +size_t +MlasGemmQuantKernel( + const MLAS_GEMM_U8X8_KERNEL_WASMRELAXEDSIMD::PackedAType* A, + const MLAS_GEMM_U8X8_KERNEL_WASMRELAXEDSIMD::PackedBType* B, + int32_t* C, + size_t PackedCountK, + size_t CountM, + size_t CountN, + size_t ldc, + const int32_t* RowSumBuffer, + const int32_t* ColumnSumBuffer, + const int32_t* ZeroPointB, + bool ZeroMode + ) +{ + MLAS_UNREFERENCED_PARAMETER(CountM); + MLAS_UNREFERENCED_PARAMETER(ldc); + + while (CountN > 0) { + + v128_t Accumulators[2]; + + // + // Initialize the accumulators with the row and column sums. + // + + int32_t RowSumValue = RowSumBuffer[0]; + + if (ZeroPointB != nullptr) { + + int32_t ScaledRowSumBuffer[8]; + + for (size_t i = 0; i < 8; i++) { + ScaledRowSumBuffer[i] = RowSumValue * ZeroPointB[i]; + } + + ZeroPointB += 8; + + Accumulators[0] = wasm_v128_load(&ScaledRowSumBuffer[0]); + Accumulators[1] = wasm_v128_load(&ScaledRowSumBuffer[4]); + + } + else { + + Accumulators[0] = wasm_i32x4_splat(RowSumValue); + Accumulators[1] = Accumulators[0]; + } + + Accumulators[0] = wasm_i32x4_add(Accumulators[0], wasm_v128_load(&ColumnSumBuffer[0])); + Accumulators[1] = wasm_i32x4_add(Accumulators[1], wasm_v128_load(&ColumnSumBuffer[4])); + ColumnSumBuffer += 8; + + // + // Broadcast each pair of 16-bit values from the matrix A and multiply + // with the pair of 16-bit values from matrix B, and add the 32-bit + // intermediate into the accumulator registers. + // + + const uint8_t* a = A; + size_t k = PackedCountK; + + while (k >= 4) { + + v128_t AElements = wasm_v128_load((v128_t*)a); + v128_t ABroadcast; + + ABroadcast = wasm_i32x4_shuffle(AElements, wasm_i32x4_splat(0), 0, 0, 0, 0); + MlasGemmU8X8MultiplyAccumulateRowWasmRelaxedSimd(ABroadcast, &B[0], Accumulators); + + ABroadcast = wasm_i32x4_shuffle(AElements, wasm_i32x4_splat(0), 1, 1, 1, 1); + MlasGemmU8X8MultiplyAccumulateRowWasmRelaxedSimd(ABroadcast, &B[32], Accumulators); + + ABroadcast = wasm_i32x4_shuffle(AElements, wasm_i32x4_splat(0), 2, 2, 2, 2); + MlasGemmU8X8MultiplyAccumulateRowWasmRelaxedSimd(ABroadcast, &B[64], Accumulators); + + ABroadcast = wasm_i32x4_shuffle(AElements, wasm_i32x4_splat(0), 3, 3, 3, 3); + MlasGemmU8X8MultiplyAccumulateRowWasmRelaxedSimd(ABroadcast, &B[96], Accumulators); + + a += 4 * 4; + B += 4 * 32; + k -= 4; + } + + while (k > 0) { + + v128_t ABroadcast = wasm_i32x4_splat(*((int32_t*)a)); + MlasGemmU8X8MultiplyAccumulateRowWasmRelaxedSimd(ABroadcast, &B[0], Accumulators); + + a += 4; + B += 32; + k -= 1; + } + + // + // Output the accumulator block after optionally accumulating the values + // from matrix C. + // + + if (CountN >= 8) { + + if (!ZeroMode) { + Accumulators[0] = wasm_i32x4_add(Accumulators[0], wasm_v128_load(&C[0])); + Accumulators[1] = wasm_i32x4_add(Accumulators[1], wasm_v128_load(&C[4])); + } + + wasm_v128_store(&C[0], Accumulators[0]); + wasm_v128_store(&C[4], Accumulators[1]); + + C += 8; + CountN -= 8; + + } + else { + + // + // Output the remaining partial output block. + // + + if ((CountN & 4) != 0) { + + if (!ZeroMode) { + Accumulators[0] = wasm_i32x4_add(Accumulators[0], wasm_v128_load(&C[0])); + } + + wasm_v128_store(&C[0], Accumulators[0]); + C += 4; + + Accumulators[0] = Accumulators[1]; + } + + if ((CountN & 2) != 0) { + + if (!ZeroMode) { + Accumulators[0] = wasm_i32x4_add(Accumulators[0], wasm_v128_load64_zero(&C[0])); + } + + wasm_v128_store64_lane(&C[0], Accumulators[0], 0); + C += 2; + + Accumulators[0] = wasm_i32x4_shuffle(Accumulators[0], wasm_i32x4_splat(0), 2, 3, 2, 3); + } + + if ((CountN & 1) != 0) { + + int32_t AccumulatorValue = wasm_i32x4_extract_lane(Accumulators[0], 0); + + if (!ZeroMode) { + AccumulatorValue += C[0]; + } + + C[0] = AccumulatorValue; + } + + CountN = 0; + } + } + + return 1; +} + +const MLAS_GEMM_QUANT_DISPATCH MlasGemmU8X8DispatchWasmRelaxedSimd = { + MlasGemmQuantOperation, + nullptr, + nullptr, + MLAS_GEMM_U8X8_KERNEL_WASMRELAXEDSIMD::PackedK, + 0, + 4 // multiple of kernel stride M +}; From b0857c49d100e1aeb13115c9d6bee77df50c1eea Mon Sep 17 00:00:00 2001 From: "jing.bao" Date: Tue, 5 Nov 2024 14:06:42 +0800 Subject: [PATCH 3/8] Use env.wasm.relaxedSimd to select relaxed SIMD wasm Now we don't need the file renaming trick. --- js/common/lib/env.ts | 9 +++++++ js/web/lib/backend-wasm.ts | 4 ++++ js/web/lib/wasm/wasm-factory.ts | 36 +++++++++++++++++++++++++++- js/web/lib/wasm/wasm-utils-import.ts | 9 +++++-- 4 files changed, 55 insertions(+), 3 deletions(-) diff --git a/js/common/lib/env.ts b/js/common/lib/env.ts index d6d9f7fa48790..1a3c9556d243c 100644 --- a/js/common/lib/env.ts +++ b/js/common/lib/env.ts @@ -52,6 +52,15 @@ export declare namespace Env { */ simd?: boolean; + /** + * set or get a boolean value indicating whether to enable Relaxed SIMD. If set to false, Relaxed SIMD will be forcely disabled. + * + * This setting is available only when WebAssembly Relaxed SIMD feature is available in current context. + * + * @defaultValue `false` + */ + relaxedSimd?: boolean; + /** * set or get a boolean value indicating whether to enable trace. * diff --git a/js/web/lib/backend-wasm.ts b/js/web/lib/backend-wasm.ts index 72b51d565896a..fa687f5a0e88a 100644 --- a/js/web/lib/backend-wasm.ts +++ b/js/web/lib/backend-wasm.ts @@ -25,6 +25,10 @@ export const initializeFlags = (): void => { ); } + if (typeof env.wasm.relaxedSimd !== 'boolean') { + env.wasm.relaxedSimd = false; + } + if (typeof env.wasm.proxy !== 'boolean') { env.wasm.proxy = false; } diff --git a/js/web/lib/wasm/wasm-factory.ts b/js/web/lib/wasm/wasm-factory.ts index 0f49d25040409..e9743c7a3f308 100644 --- a/js/web/lib/wasm/wasm-factory.ts +++ b/js/web/lib/wasm/wasm-factory.ts @@ -64,6 +64,32 @@ const isSimdSupported = (): boolean => { } }; +const isRelaxedSimdSupported = (): boolean => { + try { + // Test for WebAssembly Relaxed SIMD capability (for both browsers and Node.js) + // This typed array is a WebAssembly program containing Relaxed SIMD instructions. + + // The binary data is generated from the following code by wat2wasm: + // (module + // (func (result v128) + // i32.const 1 + // i8x16.splat + // i32.const 2 + // i8x16.splat + // i8x16.relaxed_swizzle + // ) + // ) + return WebAssembly.validate( + new Uint8Array([ + 0, 97, 115, 109, 1, 0, 0, 0, 1, 5, 1, 96, 0, 1, 123, 3, 2, 1, 0, 10, 15, 1, 13, 0, 65, 1, 253, 15, 65, 2, 253, + 15, 253, 128, 2, 11, + ]), + ); + } catch (e) { + return false; + } +}; + export const initializeWebAssembly = async (flags: Env.WebAssemblyFlags): Promise => { if (initialized) { return Promise.resolve(); @@ -80,11 +106,14 @@ export const initializeWebAssembly = async (flags: Env.WebAssemblyFlags): Promis // wasm flags are already initialized const timeout = flags.initTimeout!; let numThreads = flags.numThreads!; + const relaxedSimd = flags.relaxedSimd!; // ensure SIMD is supported if (!isSimdSupported()) { throw new Error('WebAssembly SIMD is not supported in the current environment.'); } + // check if use relaxed simd + const useRelaxedSimd = relaxedSimd && isRelaxedSimdSupported(); // check if multi-threading is supported const multiThreadSupported = isMultiThreadSupported(); @@ -116,7 +145,12 @@ export const initializeWebAssembly = async (flags: Env.WebAssemblyFlags): Promis const wasmPathOverride = (wasmPathOverrideFlag as URL)?.href ?? wasmPathOverrideFlag; const wasmBinaryOverride = flags.wasmBinary; - const [objectUrl, ortWasmFactory] = await importWasmModule(mjsPathOverride, wasmPrefixOverride, numThreads > 1); + const [objectUrl, ortWasmFactory] = await importWasmModule( + mjsPathOverride, + wasmPrefixOverride, + numThreads > 1, + useRelaxedSimd, + ); let isTimeout = false; diff --git a/js/web/lib/wasm/wasm-utils-import.ts b/js/web/lib/wasm/wasm-utils-import.ts index a8e27f6f334bc..090ff9781b65f 100644 --- a/js/web/lib/wasm/wasm-utils-import.ts +++ b/js/web/lib/wasm/wasm-utils-import.ts @@ -234,13 +234,18 @@ export const importWasmModule = async ( urlOverride: string | undefined, prefixOverride: string | undefined, isMultiThreaded: boolean, + isRelaxedSimd: boolean, ): Promise<[undefined | string, EmscriptenModuleFactory]> => { if (!urlOverride && !prefixOverride && embeddedWasmModule && scriptSrc && isSameOrigin(scriptSrc)) { return [undefined, embeddedWasmModule]; } else { const wasmModuleFilename = !BUILD_DEFS.DISABLE_JSEP - ? 'ort-wasm-simd-threaded.jsep.mjs' - : 'ort-wasm-simd-threaded.mjs'; + ? isRelaxedSimd + ? 'ort-wasm-relaxedsimd-threaded.jsep.mjs' + : 'ort-wasm-simd-threaded.jsep.mjs' + : isRelaxedSimd + ? 'ort-wasm-relaxedsimd-threaded.mjs' + : 'ort-wasm-simd-threaded.mjs'; const wasmModuleUrl = urlOverride ?? normalizeUrl(wasmModuleFilename, prefixOverride); // need to preload if all of the following conditions are met: // 1. not in Node.js. From 1e5e01f82dbaf53c392fc09ecb64d41766fc2bff Mon Sep 17 00:00:00 2001 From: "jing.bao" Date: Mon, 11 Nov 2024 13:35:09 +0800 Subject: [PATCH 4/8] Update unittests and npm test unittest for relaxed SIMD needs node v21 or newer. Add wasm.relaxedSimd to test-runner-cli-args --- cmake/onnxruntime_unittests.cmake | 6 +++++- js/web/script/test-runner-cli-args.ts | 4 ++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake index 87aee2a174fab..b603ad38d7acc 100644 --- a/cmake/onnxruntime_unittests.cmake +++ b/cmake/onnxruntime_unittests.cmake @@ -222,8 +222,12 @@ function(AddTest) else() set(TEST_NODE_FLAGS) + if (onnxruntime_ENABLE_WEBASSEMBLY_RELAXED_SIMD) + message(WARNING "Use system `node` to test Wasm relaxed SIMD. Please make sure to install node v21 or newer.") + set(NODE_EXECUTABLE node) + set(TEST_NODE_FLAGS) # prefer Node from emsdk so the version is more deterministic - if (DEFINED ENV{EMSDK_NODE}) + elseif (DEFINED ENV{EMSDK_NODE}) set(NODE_EXECUTABLE $ENV{EMSDK_NODE}) else() message(WARNING "EMSDK_NODE environment variable was not set. Falling back to system `node`.") diff --git a/js/web/script/test-runner-cli-args.ts b/js/web/script/test-runner-cli-args.ts index 088a66b24f7bd..aa395f9db5658 100644 --- a/js/web/script/test-runner-cli-args.ts +++ b/js/web/script/test-runner-cli-args.ts @@ -305,6 +305,10 @@ function parseWasmFlags(args: minimist.ParsedArgs): Env.WebAssemblyFlags { if (typeof simd !== 'undefined' && typeof simd !== 'boolean') { throw new Error('Flag "wasm.simd"/"wasm-enable-simd" must be a boolean value'); } + const relaxedSimd = (wasm.relaxedSimd = parseBooleanArg(wasm.relaxedSimd)); + if (typeof relaxedSimd !== 'undefined' && typeof relaxedSimd !== 'boolean') { + throw new Error('Flag "wasm.relaxedSimd" must be a boolean value'); + } const proxy = (wasm.proxy = parseBooleanArg(wasm.proxy ?? args['wasm-enable-proxy'])); if (typeof proxy !== 'undefined' && typeof proxy !== 'boolean') { throw new Error('Flag "wasm.proxy"/"wasm-enable-proxy" must be a boolean value'); From b3703e771793c025e910e38c1c4afd8c78cc9238 Mon Sep 17 00:00:00 2001 From: "jing.bao" Date: Mon, 11 Nov 2024 13:56:34 +0800 Subject: [PATCH 5/8] Formatting --- tools/ci_build/build.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index 42f208315c627..d32d287fc9eb3 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -1423,7 +1423,9 @@ def generate_build_tree( cmake_args.append("-Donnxruntime_DNNL_ACL_ROOT=" + args.dnnl_acl_root) if args.build_wasm: cmake_args.append("-Donnxruntime_ENABLE_WEBASSEMBLY_SIMD=" + ("ON" if args.enable_wasm_simd else "OFF")) - cmake_args.append("-Donnxruntime_ENABLE_WEBASSEMBLY_RELAXED_SIMD=" + ("ON" if args.enable_wasm_relaxed_simd else "OFF")) + cmake_args.append( + "-Donnxruntime_ENABLE_WEBASSEMBLY_RELAXED_SIMD=" + ("ON" if args.enable_wasm_relaxed_simd else "OFF") + ) if args.use_migraphx: cmake_args.append("-Donnxruntime_MIGRAPHX_HOME=" + migraphx_home) if args.use_rocm: From 6ac7ca20bc8f89d1e76fc245c1b8da62540d4666 Mon Sep 17 00:00:00 2001 From: "jing.bao" Date: Mon, 2 Dec 2024 15:50:18 +0800 Subject: [PATCH 6/8] Add build flag check Raise error when enable_wasm_simd is False but enable_wasm_relaxed_simd is True. Fix comments. --- cmake/CMakeLists.txt | 1 + cmake/onnxruntime_mlas.cmake | 2 +- .../core/mlas/lib/qgemm_kernel_wasmrelaxedsimd.cpp | 4 ++-- tools/ci_build/build.py | 9 ++++++--- 4 files changed, 10 insertions(+), 6 deletions(-) diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index 0b32d51884c05..57c5be5c59515 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -196,6 +196,7 @@ option(onnxruntime_ENABLE_WEBASSEMBLY_DEBUG_INFO "Enable this option to turn on option(onnxruntime_ENABLE_WEBASSEMBLY_PROFILING "Enable this option to turn on WebAssembly profiling and preserve function names" OFF) option(onnxruntime_ENABLE_WEBASSEMBLY_OUTPUT_OPTIMIZED_MODEL "Enable this option to allow WebAssembly to output optimized model" OFF) option(onnxruntime_ENABLE_WEBASSEMBLY_MEMORY64 "Enable this option to allow WebAssembly to use 64bit memory" OFF) +option(onnxruntime_ENABLE_WEBASSEMBLY_RELAXED_SIMD "Enable WebAssembly Relaxed SIMD" OFF) # Enable bitcode for iOS option(onnxruntime_ENABLE_BITCODE "Enable bitcode for iOS only" OFF) diff --git a/cmake/onnxruntime_mlas.cmake b/cmake/onnxruntime_mlas.cmake index 714b14374f0fe..e214bb71f0b03 100644 --- a/cmake/onnxruntime_mlas.cmake +++ b/cmake/onnxruntime_mlas.cmake @@ -252,7 +252,7 @@ function(setup_mlas_source_for_windows) endfunction() if (CMAKE_SYSTEM_NAME STREQUAL "Emscripten") - if (onnxruntime_ENABLE_WEBASSEMBLY_SIMD OR onnxruntime_ENABLE_WEBASSEMBLY_RELAXED_SIMD) + if (onnxruntime_ENABLE_WEBASSEMBLY_SIMD) file(GLOB_RECURSE mlas_platform_srcs "${MLAS_SRC_DIR}/wasm_simd/*.cpp" ) diff --git a/onnxruntime/core/mlas/lib/qgemm_kernel_wasmrelaxedsimd.cpp b/onnxruntime/core/mlas/lib/qgemm_kernel_wasmrelaxedsimd.cpp index 2c398eadead54..be0f63f6240fd 100644 --- a/onnxruntime/core/mlas/lib/qgemm_kernel_wasmrelaxedsimd.cpp +++ b/onnxruntime/core/mlas/lib/qgemm_kernel_wasmrelaxedsimd.cpp @@ -6,11 +6,11 @@ Licensed under the MIT License. Module Name: - qgemm_kernel_wasmsimd.cpp + qgemm_kernel_wasmrelaxedsimd.cpp Abstract: - This module implements QGEMM kernel for WebAssembly SIMD128. + This module implements QGEMM kernel for WebAssembly Relaxed SIMD128. --*/ diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index d32d287fc9eb3..3899767e1a2ee 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -1423,9 +1423,12 @@ def generate_build_tree( cmake_args.append("-Donnxruntime_DNNL_ACL_ROOT=" + args.dnnl_acl_root) if args.build_wasm: cmake_args.append("-Donnxruntime_ENABLE_WEBASSEMBLY_SIMD=" + ("ON" if args.enable_wasm_simd else "OFF")) - cmake_args.append( - "-Donnxruntime_ENABLE_WEBASSEMBLY_RELAXED_SIMD=" + ("ON" if args.enable_wasm_relaxed_simd else "OFF") - ) + if args.enable_wasm_relaxed_simd: + if not args.enable_wasm_simd: + raise BuildError( + "Wasm Relaxed SIMD (--enable_wasm_relaxed_simd) is only available with Wasm SIMD (--enable_wasm_simd)." + ) + cmake_args += ["-Donnxruntime_ENABLE_WEBASSEMBLY_RELAXED_SIMD=ON"] if args.use_migraphx: cmake_args.append("-Donnxruntime_MIGRAPHX_HOME=" + migraphx_home) if args.use_rocm: From 6b4e0739938fd1e8c2185b85a9f13ddd50508152 Mon Sep 17 00:00:00 2001 From: "jing.bao" Date: Mon, 2 Dec 2024 16:15:38 +0800 Subject: [PATCH 7/8] Check i32x4.relaxed_dot_i8x16_i7x16_add_s exactly --- js/web/lib/wasm/wasm-factory.ts | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/js/web/lib/wasm/wasm-factory.ts b/js/web/lib/wasm/wasm-factory.ts index e9743c7a3f308..5d872e2037ee8 100644 --- a/js/web/lib/wasm/wasm-factory.ts +++ b/js/web/lib/wasm/wasm-factory.ts @@ -76,13 +76,15 @@ const isRelaxedSimdSupported = (): boolean => { // i8x16.splat // i32.const 2 // i8x16.splat - // i8x16.relaxed_swizzle + // i32.const 3 + // i8x16.splat + // i32x4.relaxed_dot_i8x16_i7x16_add_s // ) // ) return WebAssembly.validate( new Uint8Array([ - 0, 97, 115, 109, 1, 0, 0, 0, 1, 5, 1, 96, 0, 1, 123, 3, 2, 1, 0, 10, 15, 1, 13, 0, 65, 1, 253, 15, 65, 2, 253, - 15, 253, 128, 2, 11, + 0, 97, 115, 109, 1, 0, 0, 0, 1, 5, 1, 96, 0, 1, 123, 3, 2, 1, 0, 10, 19, 1, 17, 0, 65, 1, 253, 15, 65, 2, 253, + 15, 65, 3, 253, 15, 253, 147, 2, 11, ]), ); } catch (e) { From 2e222c8c90b2ce125625de5f40f4920218388201 Mon Sep 17 00:00:00 2001 From: "jing.bao" Date: Tue, 11 Mar 2025 15:07:31 +0800 Subject: [PATCH 8/8] keep API unchanged. Drive-by: Rebase Fix comments Remove unused wasm functions --- cmake/onnxruntime_unittests.cmake | 1 - js/common/lib/env.ts | 9 ----- js/web/lib/backend-wasm.ts | 4 -- js/web/lib/wasm/wasm-factory.ts | 38 +------------------ js/web/lib/wasm/wasm-utils-import.ts | 9 +---- js/web/script/test-runner-cli-args.ts | 4 -- .../mlas/lib/qgemm_kernel_wasmrelaxedsimd.cpp | 7 +--- 7 files changed, 4 insertions(+), 68 deletions(-) diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake index b603ad38d7acc..2e2fb6d858dfc 100644 --- a/cmake/onnxruntime_unittests.cmake +++ b/cmake/onnxruntime_unittests.cmake @@ -225,7 +225,6 @@ function(AddTest) if (onnxruntime_ENABLE_WEBASSEMBLY_RELAXED_SIMD) message(WARNING "Use system `node` to test Wasm relaxed SIMD. Please make sure to install node v21 or newer.") set(NODE_EXECUTABLE node) - set(TEST_NODE_FLAGS) # prefer Node from emsdk so the version is more deterministic elseif (DEFINED ENV{EMSDK_NODE}) set(NODE_EXECUTABLE $ENV{EMSDK_NODE}) diff --git a/js/common/lib/env.ts b/js/common/lib/env.ts index 1a3c9556d243c..d6d9f7fa48790 100644 --- a/js/common/lib/env.ts +++ b/js/common/lib/env.ts @@ -52,15 +52,6 @@ export declare namespace Env { */ simd?: boolean; - /** - * set or get a boolean value indicating whether to enable Relaxed SIMD. If set to false, Relaxed SIMD will be forcely disabled. - * - * This setting is available only when WebAssembly Relaxed SIMD feature is available in current context. - * - * @defaultValue `false` - */ - relaxedSimd?: boolean; - /** * set or get a boolean value indicating whether to enable trace. * diff --git a/js/web/lib/backend-wasm.ts b/js/web/lib/backend-wasm.ts index fa687f5a0e88a..72b51d565896a 100644 --- a/js/web/lib/backend-wasm.ts +++ b/js/web/lib/backend-wasm.ts @@ -25,10 +25,6 @@ export const initializeFlags = (): void => { ); } - if (typeof env.wasm.relaxedSimd !== 'boolean') { - env.wasm.relaxedSimd = false; - } - if (typeof env.wasm.proxy !== 'boolean') { env.wasm.proxy = false; } diff --git a/js/web/lib/wasm/wasm-factory.ts b/js/web/lib/wasm/wasm-factory.ts index 5d872e2037ee8..0f49d25040409 100644 --- a/js/web/lib/wasm/wasm-factory.ts +++ b/js/web/lib/wasm/wasm-factory.ts @@ -64,34 +64,6 @@ const isSimdSupported = (): boolean => { } }; -const isRelaxedSimdSupported = (): boolean => { - try { - // Test for WebAssembly Relaxed SIMD capability (for both browsers and Node.js) - // This typed array is a WebAssembly program containing Relaxed SIMD instructions. - - // The binary data is generated from the following code by wat2wasm: - // (module - // (func (result v128) - // i32.const 1 - // i8x16.splat - // i32.const 2 - // i8x16.splat - // i32.const 3 - // i8x16.splat - // i32x4.relaxed_dot_i8x16_i7x16_add_s - // ) - // ) - return WebAssembly.validate( - new Uint8Array([ - 0, 97, 115, 109, 1, 0, 0, 0, 1, 5, 1, 96, 0, 1, 123, 3, 2, 1, 0, 10, 19, 1, 17, 0, 65, 1, 253, 15, 65, 2, 253, - 15, 65, 3, 253, 15, 253, 147, 2, 11, - ]), - ); - } catch (e) { - return false; - } -}; - export const initializeWebAssembly = async (flags: Env.WebAssemblyFlags): Promise => { if (initialized) { return Promise.resolve(); @@ -108,14 +80,11 @@ export const initializeWebAssembly = async (flags: Env.WebAssemblyFlags): Promis // wasm flags are already initialized const timeout = flags.initTimeout!; let numThreads = flags.numThreads!; - const relaxedSimd = flags.relaxedSimd!; // ensure SIMD is supported if (!isSimdSupported()) { throw new Error('WebAssembly SIMD is not supported in the current environment.'); } - // check if use relaxed simd - const useRelaxedSimd = relaxedSimd && isRelaxedSimdSupported(); // check if multi-threading is supported const multiThreadSupported = isMultiThreadSupported(); @@ -147,12 +116,7 @@ export const initializeWebAssembly = async (flags: Env.WebAssemblyFlags): Promis const wasmPathOverride = (wasmPathOverrideFlag as URL)?.href ?? wasmPathOverrideFlag; const wasmBinaryOverride = flags.wasmBinary; - const [objectUrl, ortWasmFactory] = await importWasmModule( - mjsPathOverride, - wasmPrefixOverride, - numThreads > 1, - useRelaxedSimd, - ); + const [objectUrl, ortWasmFactory] = await importWasmModule(mjsPathOverride, wasmPrefixOverride, numThreads > 1); let isTimeout = false; diff --git a/js/web/lib/wasm/wasm-utils-import.ts b/js/web/lib/wasm/wasm-utils-import.ts index 090ff9781b65f..a8e27f6f334bc 100644 --- a/js/web/lib/wasm/wasm-utils-import.ts +++ b/js/web/lib/wasm/wasm-utils-import.ts @@ -234,18 +234,13 @@ export const importWasmModule = async ( urlOverride: string | undefined, prefixOverride: string | undefined, isMultiThreaded: boolean, - isRelaxedSimd: boolean, ): Promise<[undefined | string, EmscriptenModuleFactory]> => { if (!urlOverride && !prefixOverride && embeddedWasmModule && scriptSrc && isSameOrigin(scriptSrc)) { return [undefined, embeddedWasmModule]; } else { const wasmModuleFilename = !BUILD_DEFS.DISABLE_JSEP - ? isRelaxedSimd - ? 'ort-wasm-relaxedsimd-threaded.jsep.mjs' - : 'ort-wasm-simd-threaded.jsep.mjs' - : isRelaxedSimd - ? 'ort-wasm-relaxedsimd-threaded.mjs' - : 'ort-wasm-simd-threaded.mjs'; + ? 'ort-wasm-simd-threaded.jsep.mjs' + : 'ort-wasm-simd-threaded.mjs'; const wasmModuleUrl = urlOverride ?? normalizeUrl(wasmModuleFilename, prefixOverride); // need to preload if all of the following conditions are met: // 1. not in Node.js. diff --git a/js/web/script/test-runner-cli-args.ts b/js/web/script/test-runner-cli-args.ts index aa395f9db5658..088a66b24f7bd 100644 --- a/js/web/script/test-runner-cli-args.ts +++ b/js/web/script/test-runner-cli-args.ts @@ -305,10 +305,6 @@ function parseWasmFlags(args: minimist.ParsedArgs): Env.WebAssemblyFlags { if (typeof simd !== 'undefined' && typeof simd !== 'boolean') { throw new Error('Flag "wasm.simd"/"wasm-enable-simd" must be a boolean value'); } - const relaxedSimd = (wasm.relaxedSimd = parseBooleanArg(wasm.relaxedSimd)); - if (typeof relaxedSimd !== 'undefined' && typeof relaxedSimd !== 'boolean') { - throw new Error('Flag "wasm.relaxedSimd" must be a boolean value'); - } const proxy = (wasm.proxy = parseBooleanArg(wasm.proxy ?? args['wasm-enable-proxy'])); if (typeof proxy !== 'undefined' && typeof proxy !== 'boolean') { throw new Error('Flag "wasm.proxy"/"wasm-enable-proxy" must be a boolean value'); diff --git a/onnxruntime/core/mlas/lib/qgemm_kernel_wasmrelaxedsimd.cpp b/onnxruntime/core/mlas/lib/qgemm_kernel_wasmrelaxedsimd.cpp index be0f63f6240fd..a3a0fa758d377 100644 --- a/onnxruntime/core/mlas/lib/qgemm_kernel_wasmrelaxedsimd.cpp +++ b/onnxruntime/core/mlas/lib/qgemm_kernel_wasmrelaxedsimd.cpp @@ -18,7 +18,7 @@ Module Name: #include "qgemm.h" bool HasUSDot() { -// Check out-of-bounds behaviour of Relaxed Integer Dot Product with Accumulation with signed and unsigned input (e.g. vpdpbusd). +// Check out-of-bounds behavior of Relaxed Integer Dot Product with Accumulation with signed and unsigned input (e.g. vpdpbusd). const v128_t int8_input = wasm_i8x16_const(0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0); const volatile v128_t xint8_input = wasm_i8x16_const(0, 0, 0, -128, 0, 0, -128, 0, 0, -128, 0, 0, -128, 0, 0, 0); // volatile to confuse Clang which otherwise ICE's const v128_t xint8_output = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(int8_input, xint8_input, wasm_i8x16_const_splat(0)); @@ -35,11 +35,6 @@ v128_t __attribute__((__always_inline__, __nodebug__)) wasm_i8x16_unpacklo_relax return wasm_i8x16_shuffle(a, b, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); } -// wasm implementation of "_mm_unpackhi_epi8" -v128_t __attribute__((__always_inline__, __nodebug__)) wasm_i8x16_unpackhi_relaxed(v128_t a, v128_t b) { - return wasm_i8x16_shuffle(a, b, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); -} - // wasm implementation of "_mm_unpacklo_epi16" v128_t __attribute__((__always_inline__, __nodebug__)) wasm_i16x8_unpacklo_relaxed(v128_t a, v128_t b) { return wasm_i8x16_shuffle(a, b, 0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23);