From 5f1618591bacb79658b2d87b64cd0ec99117e216 Mon Sep 17 00:00:00 2001
From: "jing.bao" <jing.bao@intel.com>
Date: Wed, 21 Aug 2024 13:28:35 +0800
Subject: [PATCH 1/8] Enable relaxed simd build

Build with --enable_wasm_relaxed_simd.
The trick here is to rename the generated .mjs file, e.g.:
cp ../../../build_wasm/Release/ort-wasm-relaxedsimd-threaded.mjs ./ort-wasm-simd-threaded.mjs
cp ../../../build_wasm/Release/ort-wasm-relaxedsimd-threaded.wasm .
---
 cmake/adjust_global_compile_flags.cmake | 5 ++++-
 cmake/external/xnnpack.cmake            | 7 ++++++-
 cmake/onnxruntime_mlas.cmake            | 2 +-
 cmake/onnxruntime_webassembly.cmake     | 4 +++-
 tools/ci_build/build.py                 | 2 ++
 5 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/cmake/adjust_global_compile_flags.cmake b/cmake/adjust_global_compile_flags.cmake
index 2aa83e9e3ee96..2ad9ba55f3269 100644
--- a/cmake/adjust_global_compile_flags.cmake
+++ b/cmake/adjust_global_compile_flags.cmake
@@ -35,7 +35,10 @@ if (CMAKE_SYSTEM_NAME STREQUAL "Emscripten")
     set(CMAKE_CXX_FLAGS_DEBUG "-g2")
   endif()
 
-  if (onnxruntime_ENABLE_WEBASSEMBLY_SIMD)
+  if (onnxruntime_ENABLE_WEBASSEMBLY_RELAXED_SIMD)
+    string(APPEND CMAKE_C_FLAGS " -msimd128 -mrelaxed-simd")
+    string(APPEND CMAKE_CXX_FLAGS " -msimd128 -mrelaxed-simd")
+  elseif (onnxruntime_ENABLE_WEBASSEMBLY_SIMD)
     string(APPEND CMAKE_C_FLAGS " -msimd128")
     string(APPEND CMAKE_CXX_FLAGS " -msimd128")
   endif()
diff --git a/cmake/external/xnnpack.cmake b/cmake/external/xnnpack.cmake
index 02ef9a198a803..d0ab770053be1 100644
--- a/cmake/external/xnnpack.cmake
+++ b/cmake/external/xnnpack.cmake
@@ -143,7 +143,12 @@ if(CMAKE_SYSTEM_NAME STREQUAL "Emscripten")
   list(APPEND wasm_srcs ${XNNPACK_DIR}/src/amalgam/gen/scalar.c)
   list(APPEND wasm_srcs ${XNNPACK_DIR}/src/amalgam/gen/wasm.c)
 
-  if(onnxruntime_ENABLE_WEBASSEMBLY_SIMD)
+  if(onnxruntime_ENABLE_WEBASSEMBLY_RELAXED_SIMD)
+    list(APPEND wasm_srcs ${XNNPACK_DIR}/src/amalgam/gen/wasmsimd.c)
+    list(APPEND wasm_srcs ${XNNPACK_DIR}/src/amalgam/gen/wasmrelaxedsimd.c)
+    target_compile_options(XNNPACK PRIVATE "-msimd128")
+    target_compile_options(XNNPACK PRIVATE "-mrelaxed-simd")
+  elseif(onnxruntime_ENABLE_WEBASSEMBLY_SIMD)
     list(APPEND wasm_srcs ${XNNPACK_DIR}/src/amalgam/gen/wasmsimd.c)
     target_compile_options(XNNPACK PRIVATE "-msimd128")
   endif()
diff --git a/cmake/onnxruntime_mlas.cmake b/cmake/onnxruntime_mlas.cmake
index 15864a0198161..2be5117503d64 100644
--- a/cmake/onnxruntime_mlas.cmake
+++ b/cmake/onnxruntime_mlas.cmake
@@ -252,7 +252,7 @@ function(setup_mlas_source_for_windows)
 endfunction()
 
 if (CMAKE_SYSTEM_NAME STREQUAL "Emscripten")
-  if (onnxruntime_ENABLE_WEBASSEMBLY_SIMD)
+  if (onnxruntime_ENABLE_WEBASSEMBLY_SIMD OR onnxruntime_ENABLE_WEBASSEMBLY_RELAXED_SIMD)
     file(GLOB_RECURSE mlas_platform_srcs
       "${MLAS_SRC_DIR}/wasm_simd/*.cpp"
     )
diff --git a/cmake/onnxruntime_webassembly.cmake b/cmake/onnxruntime_webassembly.cmake
index f3afaf7033fd1..bfb73e14ce7a4 100644
--- a/cmake/onnxruntime_webassembly.cmake
+++ b/cmake/onnxruntime_webassembly.cmake
@@ -485,7 +485,9 @@ jsepDownload:_pp_")
 
   list(APPEND target_name_list  "wasm")
 
-  if (onnxruntime_ENABLE_WEBASSEMBLY_SIMD)
+  if (onnxruntime_ENABLE_WEBASSEMBLY_RELAXED_SIMD)
+    list(APPEND target_name_list  "relaxedsimd")
+  elseif (onnxruntime_ENABLE_WEBASSEMBLY_SIMD)
     list(APPEND target_name_list  "simd")
   endif()
 
diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py
index db7dbed23a2d2..42f208315c627 100644
--- a/tools/ci_build/build.py
+++ b/tools/ci_build/build.py
@@ -507,6 +507,7 @@ def convert_arg_line_to_args(self, arg_line):
     parser.add_argument("--emsdk_version", default="4.0.4", help="Specify version of emsdk")
 
     parser.add_argument("--enable_wasm_simd", action="store_true", help="Enable WebAssembly SIMD")
+    parser.add_argument("--enable_wasm_relaxed_simd", action="store_true", help="Enable WebAssembly Relaxed SIMD")
     parser.add_argument("--enable_wasm_threads", action="store_true", help="Enable WebAssembly multi-threads support")
 
     parser.add_argument(
@@ -1422,6 +1423,7 @@ def generate_build_tree(
         cmake_args.append("-Donnxruntime_DNNL_ACL_ROOT=" + args.dnnl_acl_root)
     if args.build_wasm:
         cmake_args.append("-Donnxruntime_ENABLE_WEBASSEMBLY_SIMD=" + ("ON" if args.enable_wasm_simd else "OFF"))
+        cmake_args.append("-Donnxruntime_ENABLE_WEBASSEMBLY_RELAXED_SIMD=" + ("ON" if args.enable_wasm_relaxed_simd else "OFF"))
     if args.use_migraphx:
         cmake_args.append("-Donnxruntime_MIGRAPHX_HOME=" + migraphx_home)
     if args.use_rocm:

From b655b7a1d42d54e1988c7b70c391709e28d4949c Mon Sep 17 00:00:00 2001
From: "jing.bao" <jing.bao@intel.com>
Date: Thu, 5 Sep 2024 16:30:28 +0800
Subject: [PATCH 2/8] Add Wasm VNNI dispatch for QGemmU8X8

---
 cmake/onnxruntime_mlas.cmake                  |   6 +
 onnxruntime/core/mlas/inc/mlas.h              |   5 +-
 onnxruntime/core/mlas/lib/mlasi.h             |   5 +
 onnxruntime/core/mlas/lib/qgemm.h             |   8 +
 .../mlas/lib/qgemm_kernel_wasmrelaxedsimd.cpp | 568 ++++++++++++++++++
 5 files changed, 591 insertions(+), 1 deletion(-)
 create mode 100644 onnxruntime/core/mlas/lib/qgemm_kernel_wasmrelaxedsimd.cpp

diff --git a/cmake/onnxruntime_mlas.cmake b/cmake/onnxruntime_mlas.cmake
index 2be5117503d64..714b14374f0fe 100644
--- a/cmake/onnxruntime_mlas.cmake
+++ b/cmake/onnxruntime_mlas.cmake
@@ -260,6 +260,12 @@ if (CMAKE_SYSTEM_NAME STREQUAL "Emscripten")
       ${mlas_platform_srcs}
       ${MLAS_SRC_DIR}/qgemm_kernel_wasmsimd.cpp
     )
+    if (onnxruntime_ENABLE_WEBASSEMBLY_RELAXED_SIMD)
+      set(mlas_platform_srcs
+        ${mlas_platform_srcs}
+        ${MLAS_SRC_DIR}/qgemm_kernel_wasmrelaxedsimd.cpp
+      )
+    endif()
   else()
     file(GLOB_RECURSE mlas_platform_srcs
       "${MLAS_SRC_DIR}/scalar/*.cpp"
diff --git a/onnxruntime/core/mlas/inc/mlas.h b/onnxruntime/core/mlas/inc/mlas.h
index 1401e27ca77e5..a4052915bbae6 100644
--- a/onnxruntime/core/mlas/inc/mlas.h
+++ b/onnxruntime/core/mlas/inc/mlas.h
@@ -63,7 +63,10 @@ Module Name:
 #endif
 #if defined(__wasm__)
 #define MLAS_TARGET_WASM
-#if defined(__wasm_simd128__)
+#if defined(__wasm_relaxed_simd__)
+#define MLAS_TARGET_WASM_RELAXED_SIMD
+#define MLAS_TARGET_WASM_SIMD
+#elif defined(__wasm_simd128__)
 #define MLAS_TARGET_WASM_SIMD
 #else
 #define MLAS_TARGET_WASM_SCALAR
diff --git a/onnxruntime/core/mlas/lib/mlasi.h b/onnxruntime/core/mlas/lib/mlasi.h
index 0681b49252495..507cf35d1c3c4 100644
--- a/onnxruntime/core/mlas/lib/mlasi.h
+++ b/onnxruntime/core/mlas/lib/mlasi.h
@@ -996,9 +996,14 @@ extern const MLAS_GEMM_QUANT_DISPATCH MlasGemmS8S8DispatchSdot;
 extern const MLAS_GEMM_QUANT_DISPATCH MlasGemmU8X8DispatchUmmla;
 extern const MLAS_GEMM_QUANT_DISPATCH MlasGemmS8S8DispatchSmmla;
 extern const MLAS_GEMM_QUANT_DISPATCH MlasGemmU8X8DispatchWasmSimd;
+extern const MLAS_GEMM_QUANT_DISPATCH MlasGemmU8X8DispatchWasmRelaxedSimd;
 extern const MLAS_GEMM_QUANT_DISPATCH MlasGemmQuantDispatchDefault;
 extern const MLAS_GEMM_QUANT_DISPATCH MlasGemm8X8DispatchPOWER10;
 
+#if defined(MLAS_TARGET_WASM_RELAXED_SIMD)
+extern bool HasUSDot();
+#endif
+
 //
 // Symmetric quantized qgemm dispatch structure
 //
diff --git a/onnxruntime/core/mlas/lib/qgemm.h b/onnxruntime/core/mlas/lib/qgemm.h
index bcd878efa681b..596267c3abdff 100644
--- a/onnxruntime/core/mlas/lib/qgemm.h
+++ b/onnxruntime/core/mlas/lib/qgemm.h
@@ -886,6 +886,14 @@ MlasGemmQuantGetDispatch(
     if(BIsSigned || !AIsSigned) {
         GemmQuantDispatch = &MlasGemmU8X8DispatchNeon;
     }
+#elif defined(MLAS_TARGET_WASM_RELAXED_SIMD)
+    if (!AIsSigned) {
+        if (HasUSDot()) {
+          GemmQuantDispatch = &MlasGemmU8X8DispatchWasmRelaxedSimd;
+        } else {
+          GemmQuantDispatch = &MlasGemmU8X8DispatchWasmSimd;
+        }
+    }
 #elif defined(MLAS_TARGET_WASM_SIMD)
     if (!AIsSigned) {
         GemmQuantDispatch = &MlasGemmU8X8DispatchWasmSimd;
diff --git a/onnxruntime/core/mlas/lib/qgemm_kernel_wasmrelaxedsimd.cpp b/onnxruntime/core/mlas/lib/qgemm_kernel_wasmrelaxedsimd.cpp
new file mode 100644
index 0000000000000..2c398eadead54
--- /dev/null
+++ b/onnxruntime/core/mlas/lib/qgemm_kernel_wasmrelaxedsimd.cpp
@@ -0,0 +1,568 @@
+/*++
+
+Copyright (c) Microsoft Corporation. All rights reserved.
+
+Licensed under the MIT License.
+
+Module Name:
+
+    qgemm_kernel_wasmrelaxedsimd.cpp
+
+Abstract:
+
+    This module implements QGEMM kernel for WebAssembly Relaxed SIMD.
+
+--*/
+
+#include "mlasi.h"
+#include "qgemm.h"
+
+bool HasUSDot() {
+// Check out-of-bounds behaviour of Relaxed Integer Dot Product with Accumulation with signed and unsigned input (e.g. vpdpbusd).
+      const v128_t int8_input = wasm_i8x16_const(0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0);
+      const volatile v128_t xint8_input = wasm_i8x16_const(0, 0, 0, -128, 0, 0, -128, 0, 0, -128, 0, 0, -128, 0, 0, 0);  // volatile to confuse Clang which otherwise ICE's
+      const v128_t xint8_output = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(int8_input, xint8_input, wasm_i8x16_const_splat(0));  // 1 * 128 per lane if the second operand is read as unsigned
+
+      const volatile v128_t overflow_input = wasm_i8x16_const(-128, -128, -128, -128, -128, -128, -1, -1, -1, -1, -128, -128, -1, -1, -1, -1);  // volatile to confuse Clang which otherwise ICE's
+      const v128_t overflow_output = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(wasm_i8x16_const_splat(-128), overflow_input, wasm_i8x16_const_splat(0));  // -128 * (sum of the four bytes read as unsigned) when nothing saturates
+      return !wasm_v128_any_true(wasm_v128_or(  // true only if both probes reproduce the expected values bit-for-bit
+        wasm_v128_xor(xint8_output, wasm_i32x4_const_splat(128)),
+        wasm_v128_xor(overflow_output, wasm_i32x4_const(-65536, -98048, -98048, -130560))));
+}
+
+// wasm implementation of "_mm_unpacklo_epi8": interleaves the low 8 bytes of a and b (a0, b0, a1, b1, ...)
+v128_t __attribute__((__always_inline__, __nodebug__)) wasm_i8x16_unpacklo_relaxed(v128_t a, v128_t b) {
+    return wasm_i8x16_shuffle(a, b, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
+}
+
+// wasm implementation of "_mm_unpackhi_epi8": interleaves the high 8 bytes of a and b (a8, b8, a9, b9, ...)
+v128_t __attribute__((__always_inline__, __nodebug__)) wasm_i8x16_unpackhi_relaxed(v128_t a, v128_t b) {
+    return wasm_i8x16_shuffle(a, b, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
+}
+
+// wasm implementation of "_mm_unpacklo_epi16": interleaves the low four 16-bit lanes of a and b
+v128_t __attribute__((__always_inline__, __nodebug__)) wasm_i16x8_unpacklo_relaxed(v128_t a, v128_t b) {
+    return wasm_i8x16_shuffle(a, b, 0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23);
+}
+
+// wasm implementation of "_mm_unpackhi_epi16": interleaves the high four 16-bit lanes of a and b
+v128_t __attribute__((__always_inline__, __nodebug__)) wasm_i16x8_unpackhi_relaxed(v128_t a, v128_t b) {
+    return wasm_i8x16_shuffle(a, b, 8, 9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31);
+}
+
+struct MLAS_GEMM_U8X8_KERNEL_WASMRELAXEDSIMD  // kernel traits used to specialize the MlasGemmQuant* templates below
+{
+    typedef uint8_t PackedAType;
+    typedef uint8_t PackedBType;
+    typedef uint8_t OffsetAType;
+    typedef int8_t OffsetBType;  // the B zero point is narrowed to this type in MlasGemmQuantFixupZeroPointB
+
+    static constexpr size_t PackedK = 4;  // K is packed in groups of 4 bytes, one group per 32-bit dot-product lane
+    static constexpr MLAS_GEMM_QUANT_STRIDES Strides{ 12, 128, 128 };
+    static constexpr MLAS_GEMM_QUANT_STRIDES PackedStrides{0, 0, 0};
+};
+
+constexpr size_t MLAS_GEMM_U8X8_KERNEL_WASMRELAXEDSIMD::PackedK;  // namespace-scope definitions of the static members
+constexpr MLAS_GEMM_QUANT_STRIDES MLAS_GEMM_U8X8_KERNEL_WASMRELAXEDSIMD::Strides;  // (PackedStrides has no such definition)
+
+template<>
+MLAS_FORCEINLINE
+int32_t
+MlasGemmQuantFixupZeroPointB<MLAS_GEMM_U8X8_KERNEL_WASMRELAXEDSIMD>(
+    int32_t ZeroPointB,
+    bool BIsSigned
+    )
+{
+    if (!BIsSigned) {  // unsigned B: flip bit 7 and reinterpret as int8_t, matching the 0x80 XOR applied when B is packed
+        ZeroPointB = MLAS_GEMM_U8X8_KERNEL_WASMRELAXEDSIMD::OffsetBType(ZeroPointB ^ 0x80);
+    }
+
+    return ZeroPointB;
+}
+
+template<>
+void
+MlasGemmQuantCopyPackA<MLAS_GEMM_U8X8_KERNEL_WASMRELAXEDSIMD>(
+    MLAS_GEMM_U8X8_KERNEL_WASMRELAXEDSIMD::PackedAType* D,
+    const uint8_t* A,
+    size_t lda,
+    size_t CountM,
+    size_t CountK,
+    int32_t* RowSumBuffer,
+    bool AIsSigned
+    )
+{
+    MLAS_UNREFERENCED_PARAMETER(AIsSigned);
+    const v128_t ZeroVector = wasm_i64x2_const(0, 0);
+    const v128_t OnesWordBroadcast = wasm_i16x8_splat(1);
+    uint8_t PaddedMatrixAData[8] = { 0 };  // zero-initialized staging buffer for the last (CountK % 8) bytes of a row
+
+    //
+    // Process a single row of matrix A in a loop.
+    //
+
+    while (CountM > 0) {
+
+        const uint8_t* a = A;
+        size_t k = CountK;
+        v128_t ReductionVector = ZeroVector;
+
+        //
+        // Copy the source bytes to the packed buffer.
+        //
+        // The packed buffer has the same data ordering as the source bytes,
+        // but CountK is aligned up to a multiple of 4 to maintain 32-bit
+        // alignment. All extra bytes are zero-padded.
+        //
+        // Zero extend the source bytes to 16 bits and accumulate them into
+        // eight intermediate per-row partial sums (one per 16-bit lane).
+        // CountK cannot be greater than 128 to avoid overflowing
+        // these signed 16-bit accumulators.
+        //
+
+        while (k >= 8) {
+
+            v128_t Bytes = wasm_v128_load64_zero(&a[0]);
+            v128_t Words = wasm_i8x16_unpacklo_relaxed(Bytes, ZeroVector);  // zero extend the 8 bytes to 16-bit lanes
+
+            ReductionVector = wasm_i16x8_add(ReductionVector, Words);
+
+            wasm_v128_store64_lane(&D[0], Bytes, 0);
+
+            a += 8;
+            D += 8;
+            k -= 8;
+        }
+
+        if (k > 0) {
+
+            //
+            // Copy the remaining bytes to the zero padded stack buffer.
+            //
+
+            uint8_t* padded = PaddedMatrixAData;
+            uint8_t* padded_end = padded + k;
+
+            do {
+                padded[0] = a[0];
+                padded++;
+                a++;
+            } while (padded < padded_end);
+
+            v128_t Bytes = wasm_v128_load64_zero(PaddedMatrixAData);
+            v128_t Words = wasm_i8x16_unpacklo_relaxed(Bytes, ZeroVector);
+
+            ReductionVector = wasm_i16x8_add(ReductionVector, Words);
+
+            //
+            // Copy quads of 8-bit values from the vector to the packed
+            // buffer and rotate the vector for the next iteration.
+            //
+
+            for (size_t quads = (k + 3) / 4; quads > 0; quads--) {
+                *((int32_t*)D) = wasm_i32x4_extract_lane(Bytes, 0);
+                D += 4;
+                Bytes = wasm_i32x4_shuffle(Bytes, wasm_i32x4_splat(0), 1, 2, 3, 0);  // move lane 1 into lane 0
+            }
+        }
+
+        //
+        // Reduce the partial accumulators.
+        //
+
+        ReductionVector = wasm_i32x4_dot_i16x8(ReductionVector, OnesWordBroadcast);  // add adjacent 16-bit lanes into four 32-bit lanes
+        ReductionVector = wasm_i32x4_add(ReductionVector,
+                                         wasm_i32x4_shuffle(ReductionVector, wasm_i32x4_splat(0), 2, 3, 2, 3));  // fold lanes 2,3 onto lanes 0,1
+        ReductionVector = wasm_i32x4_add(ReductionVector,
+                                         wasm_i32x4_shuffle(ReductionVector, wasm_i32x4_splat(0), 1, 0, 1, 0));  // fold lane 1 onto lane 0
+
+        *RowSumBuffer++ = wasm_i32x4_extract_lane(ReductionVector, 0);  // lane 0 now holds the sum of the row's bytes
+
+        A += lda;
+        CountM -= 1;
+    }
+}
+
+
+MLAS_FORCEINLINE
+void
+MlasGemmU8X8CopyPackBProcessWasmRelaxedSimd(
+    MLAS_GEMM_U8X8_KERNEL_WASMRELAXEDSIMD::PackedBType* D,
+    v128_t BytesRow0,
+    v128_t BytesRow1,
+    v128_t BytesRow2,
+    v128_t BytesRow3,
+    v128_t BitFlipVector,
+    v128_t OnesByteBroadcast,
+    v128_t ColumnSums[2]
+)
+{
+    v128_t PairsInterleaved0 = wasm_i8x16_unpacklo_relaxed(BytesRow0, BytesRow1);  // row0/row1 bytes of columns 0..7, interleaved
+    v128_t PairsInterleaved1 = wasm_i8x16_unpacklo_relaxed(BytesRow2, BytesRow3);  // row2/row3 bytes of columns 0..7, interleaved
+
+    PairsInterleaved0 = wasm_v128_xor(PairsInterleaved0, BitFlipVector);  // apply the caller-supplied XOR mask to every byte
+    PairsInterleaved1 = wasm_v128_xor(PairsInterleaved1, BitFlipVector);
+
+    v128_t QuadsInterleaved0 = wasm_i16x8_unpacklo_relaxed(PairsInterleaved0, PairsInterleaved1);  // columns 0..3: 4 row bytes per 32-bit lane
+    v128_t QuadsInterleaved1 = wasm_i16x8_unpackhi_relaxed(PairsInterleaved0, PairsInterleaved1);  // columns 4..7: 4 row bytes per 32-bit lane
+
+    ColumnSums[0] = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(QuadsInterleaved0, OnesByteBroadcast, ColumnSums[0]);  // with a vector of ones this adds each lane's 4 bytes
+    ColumnSums[1] = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(QuadsInterleaved1, OnesByteBroadcast, ColumnSums[1]);
+
+    wasm_v128_store(&D[0], QuadsInterleaved0);
+    wasm_v128_store(&D[16], QuadsInterleaved1);
+}
+
+template<>
+void
+MlasGemmQuantCopyPackB<MLAS_GEMM_U8X8_KERNEL_WASMRELAXEDSIMD>(
+    MLAS_GEMM_U8X8_KERNEL_WASMRELAXEDSIMD::PackedBType* D,
+    const uint8_t* B,
+    size_t ldb,
+    size_t CountN,
+    size_t CountK,
+    int32_t* ColumnSumBuffer,
+    bool BIsSigned
+    )
+{
+    const v128_t OnesByteBroadcast = wasm_i8x16_splat(1);
+    const v128_t BitFlipVector = wasm_i32x4_splat(BIsSigned ? 0 : 0x80808080);  // flips bit 7 of every byte for unsigned B; no-op for signed B
+
+    //
+    // Process 8 columns of matrix B in a loop.
+    //
+
+    while (CountN >= 8) {
+
+        const uint8_t* b = B;
+        size_t k = CountK;
+        v128_t ColumnSums[2];
+
+        ColumnSums[0] = wasm_i64x2_const(0, 0);
+        ColumnSums[1] = wasm_i64x2_const(0, 0);
+
+        //
+        // Interleave rows of matrix B and write to the packed buffer.
+        //
+
+        while (k >= MLAS_GEMM_U8X8_KERNEL_WASMRELAXEDSIMD::PackedK) {
+
+            v128_t BytesRow0 = wasm_v128_load64_zero(&b[0]);
+            v128_t BytesRow1 = wasm_v128_load64_zero(&b[ldb]);
+            v128_t BytesRow2 = wasm_v128_load64_zero(&b[ldb * 2]);
+            v128_t BytesRow3 = wasm_v128_load64_zero(&b[ldb * 3]);
+
+            MlasGemmU8X8CopyPackBProcessWasmRelaxedSimd(D, BytesRow0, BytesRow1, BytesRow2, BytesRow3, BitFlipVector, OnesByteBroadcast, ColumnSums);
+
+            b += ldb * 4;
+            D += 32;
+            k -= 4;
+        }
+
+        if (k > 0) {
+
+            v128_t BytesRow0 = wasm_v128_load64_zero(&b[0]);
+            v128_t BytesRow1 = BitFlipVector;  // rows beyond CountK: BitFlipVector XORed with itself packs as zero
+            v128_t BytesRow2 = BitFlipVector;
+            v128_t BytesRow3 = BitFlipVector;
+
+            if (k >= 2) {
+                BytesRow1 = wasm_v128_load64_zero(&b[ldb]);
+            }
+
+            if (k >= 3) {
+                BytesRow2 = wasm_v128_load64_zero(&b[ldb * 2]);
+            }
+
+            MlasGemmU8X8CopyPackBProcessWasmRelaxedSimd(D, BytesRow0, BytesRow1, BytesRow2, BytesRow3, BitFlipVector, OnesByteBroadcast, ColumnSums);
+
+            D += 32;
+        }
+
+        wasm_v128_store(&ColumnSumBuffer[0], ColumnSums[0]);
+        wasm_v128_store(&ColumnSumBuffer[4], ColumnSums[1]);
+        ColumnSumBuffer += 8;
+
+        B += 8;
+        CountN -= 8;
+    }
+
+    //
+    // Process the remaining columns of matrix B.
+    //
+
+    if (CountN > 0) {
+
+        const uint8_t* b = B;
+        size_t k = CountK;
+        v128_t ColumnSums[2];
+        uint8_t PaddedMatrixBData[32];  // four staging rows of 8 bytes each
+
+        wasm_v128_store(&PaddedMatrixBData[0], BitFlipVector);
+        wasm_v128_store(&PaddedMatrixBData[16], BitFlipVector);
+
+        ColumnSums[0] = wasm_i64x2_const(0, 0);
+        ColumnSums[1] = wasm_i64x2_const(0, 0);
+
+        //
+        // Interleave rows of matrix B using an intermediate stack buffer padded with
+        // BitFlipVector bytes (zero after the XOR) and write to the packed buffer.
+        //
+
+        while (k >= MLAS_GEMM_U8X8_KERNEL_WASMRELAXEDSIMD::PackedK) {
+
+            const uint8_t* bcopy = b;
+            uint8_t* padded = PaddedMatrixBData;
+            uint8_t* padded_end = padded + CountN;
+
+            do {
+                padded[0] = bcopy[0];
+                padded[8] = bcopy[ldb];
+                padded[16] = bcopy[ldb * 2];
+                padded[24] = bcopy[ldb * 3];
+                padded++;
+                bcopy++;
+            } while (padded < padded_end);
+
+            v128_t BytesRow0 = wasm_v128_load64_zero(&PaddedMatrixBData[0]);
+            v128_t BytesRow1 = wasm_v128_load64_zero(&PaddedMatrixBData[8]);
+            v128_t BytesRow2 = wasm_v128_load64_zero(&PaddedMatrixBData[16]);
+            v128_t BytesRow3 = wasm_v128_load64_zero(&PaddedMatrixBData[24]);
+
+            MlasGemmU8X8CopyPackBProcessWasmRelaxedSimd(D, BytesRow0, BytesRow1, BytesRow2, BytesRow3, BitFlipVector, OnesByteBroadcast, ColumnSums);
+
+            b += ldb * 4;
+            D += 32;
+            k -= 4;
+        }
+
+        if (k > 0) {
+
+            const uint8_t* bcopy = b;
+            uint8_t* padded = PaddedMatrixBData;
+            uint8_t* padded_end = padded + CountN;
+
+            wasm_v128_store(&PaddedMatrixBData[0], BitFlipVector);  // reset the staging rows so rows beyond CountK pack as zero
+            wasm_v128_store(&PaddedMatrixBData[16], BitFlipVector);
+
+            if (k == 3) {
+              do {
+                  padded[0] = bcopy[0];
+                  padded[8] = bcopy[ldb];
+                  padded[16] = bcopy[ldb * 2];
+                  padded++;
+                  bcopy++;
+              } while (padded < padded_end);
+            } else if (k == 2) {
+              do {
+                  padded[0] = bcopy[0];
+                  padded[8] = bcopy[ldb];
+                  padded++;
+                  bcopy++;
+              } while (padded < padded_end);
+            } else {
+              do {
+                  padded[0] = bcopy[0];
+                  padded++;
+                  bcopy++;
+              } while (padded < padded_end);
+            }
+
+            v128_t BytesRow0 = wasm_v128_load64_zero(&PaddedMatrixBData[0]);
+            v128_t BytesRow1 = wasm_v128_load64_zero(&PaddedMatrixBData[8]);
+            v128_t BytesRow2 = wasm_v128_load64_zero(&PaddedMatrixBData[16]);
+            v128_t BytesRow3 = wasm_v128_load64_zero(&PaddedMatrixBData[24]);
+
+            MlasGemmU8X8CopyPackBProcessWasmRelaxedSimd(D, BytesRow0, BytesRow1, BytesRow2, BytesRow3, BitFlipVector, OnesByteBroadcast, ColumnSums);
+        }
+
+        wasm_v128_store(&ColumnSumBuffer[0], ColumnSums[0]);  // all 8 sums are stored even though fewer than 8 columns remain
+        wasm_v128_store(&ColumnSumBuffer[4], ColumnSums[1]);
+    }
+}
+
+MLAS_FORCEINLINE
+void
+MlasGemmU8X8MultiplyAccumulateRowWasmRelaxedSimd(
+    v128_t ABroadcast,
+    const uint8_t* B,
+    v128_t Accumulators[2]
+)
+{
+    v128_t BElements0 = wasm_v128_load(&B[0]);  // first 16 bytes of the 32-byte packed B group
+    v128_t BElements1 = wasm_v128_load(&B[16]);  // second 16 bytes of the 32-byte packed B group
+
+    Accumulators[0] = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(BElements0, ABroadcast, Accumulators[0]);  // B is the first operand, the A bytes the second
+    Accumulators[1] = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(BElements1, ABroadcast, Accumulators[1]);
+}
+
+
+template<>
+size_t
+MlasGemmQuantKernel<MLAS_GEMM_U8X8_KERNEL_WASMRELAXEDSIMD>(
+    const MLAS_GEMM_U8X8_KERNEL_WASMRELAXEDSIMD::PackedAType* A,
+    const MLAS_GEMM_U8X8_KERNEL_WASMRELAXEDSIMD::PackedBType* B,
+    int32_t* C,
+    size_t PackedCountK,
+    size_t CountM,
+    size_t CountN,
+    size_t ldc,
+    const int32_t* RowSumBuffer,
+    const int32_t* ColumnSumBuffer,
+    const int32_t* ZeroPointB,
+    bool ZeroMode
+    )
+{
+    MLAS_UNREFERENCED_PARAMETER(CountM);  // only one packed row of A and RowSumBuffer[0] are read per call
+    MLAS_UNREFERENCED_PARAMETER(ldc);
+
+    while (CountN > 0) {
+
+        v128_t Accumulators[2];
+
+        //
+        // Initialize the accumulators with the row and column sums.
+        //
+
+        int32_t RowSumValue = RowSumBuffer[0];
+
+        if (ZeroPointB != nullptr) {
+
+            int32_t ScaledRowSumBuffer[8];
+
+            for (size_t i = 0; i < 8; i++) {
+                ScaledRowSumBuffer[i] = RowSumValue * ZeroPointB[i];
+            }
+
+            ZeroPointB += 8;
+
+            Accumulators[0] = wasm_v128_load(&ScaledRowSumBuffer[0]);
+            Accumulators[1] = wasm_v128_load(&ScaledRowSumBuffer[4]);
+
+        }
+        else {
+
+            Accumulators[0] = wasm_i32x4_splat(RowSumValue);
+            Accumulators[1] = Accumulators[0];
+        }
+
+        Accumulators[0] = wasm_i32x4_add(Accumulators[0], wasm_v128_load(&ColumnSumBuffer[0]));
+        Accumulators[1] = wasm_i32x4_add(Accumulators[1], wasm_v128_load(&ColumnSumBuffer[4]));
+        ColumnSumBuffer += 8;
+
+        //
+        // Broadcast each group of four 8-bit values from matrix A and compute
+        // the dot product with the packed quads of 8-bit values from matrix B,
+        // adding the 32-bit results into the accumulator registers.
+        //
+
+        const uint8_t* a = A;
+        size_t k = PackedCountK;
+
+        while (k >= 4) {
+
+            v128_t AElements = wasm_v128_load((v128_t*)a);  // four groups of 4 A bytes
+            v128_t ABroadcast;
+
+            ABroadcast = wasm_i32x4_shuffle(AElements, wasm_i32x4_splat(0), 0, 0, 0, 0);
+            MlasGemmU8X8MultiplyAccumulateRowWasmRelaxedSimd(ABroadcast, &B[0], Accumulators);
+
+            ABroadcast = wasm_i32x4_shuffle(AElements, wasm_i32x4_splat(0), 1, 1, 1, 1);
+            MlasGemmU8X8MultiplyAccumulateRowWasmRelaxedSimd(ABroadcast, &B[32], Accumulators);
+
+            ABroadcast = wasm_i32x4_shuffle(AElements, wasm_i32x4_splat(0), 2, 2, 2, 2);
+            MlasGemmU8X8MultiplyAccumulateRowWasmRelaxedSimd(ABroadcast, &B[64], Accumulators);
+
+            ABroadcast = wasm_i32x4_shuffle(AElements, wasm_i32x4_splat(0), 3, 3, 3, 3);
+            MlasGemmU8X8MultiplyAccumulateRowWasmRelaxedSimd(ABroadcast, &B[96], Accumulators);
+
+            a += 4 * 4;
+            B += 4 * 32;
+            k -= 4;
+        }
+
+        while (k > 0) {
+
+            v128_t ABroadcast = wasm_i32x4_splat(*((int32_t*)a));  // one group of 4 A bytes replicated to every lane
+            MlasGemmU8X8MultiplyAccumulateRowWasmRelaxedSimd(ABroadcast, &B[0], Accumulators);
+
+            a += 4;
+            B += 32;
+            k -= 1;
+        }
+
+        //
+        // Output the accumulator block after optionally accumulating the values
+        // from matrix C.
+        //
+
+        if (CountN >= 8) {
+
+            if (!ZeroMode) {
+                Accumulators[0] = wasm_i32x4_add(Accumulators[0], wasm_v128_load(&C[0]));
+                Accumulators[1] = wasm_i32x4_add(Accumulators[1], wasm_v128_load(&C[4]));
+            }
+
+            wasm_v128_store(&C[0], Accumulators[0]);
+            wasm_v128_store(&C[4], Accumulators[1]);
+
+            C += 8;
+            CountN -= 8;
+
+        }
+        else {
+
+            //
+            // Output the remaining partial output block.
+            //
+
+            if ((CountN & 4) != 0) {
+
+                if (!ZeroMode) {
+                    Accumulators[0] = wasm_i32x4_add(Accumulators[0], wasm_v128_load(&C[0]));
+                }
+
+                wasm_v128_store(&C[0], Accumulators[0]);
+                C += 4;
+
+                Accumulators[0] = Accumulators[1];  // continue with columns 4..7
+            }
+
+            if ((CountN & 2) != 0) {
+
+                if (!ZeroMode) {
+                    Accumulators[0] = wasm_i32x4_add(Accumulators[0], wasm_v128_load64_zero(&C[0]));
+                }
+
+                wasm_v128_store64_lane(&C[0], Accumulators[0], 0);
+                C += 2;
+
+                Accumulators[0] = wasm_i32x4_shuffle(Accumulators[0], wasm_i32x4_splat(0), 2, 3, 2, 3);  // move lanes 2,3 down to lanes 0,1
+            }
+
+            if ((CountN & 1) != 0) {
+
+                int32_t AccumulatorValue = wasm_i32x4_extract_lane(Accumulators[0], 0);
+
+                if (!ZeroMode) {
+                    AccumulatorValue += C[0];
+                }
+
+                C[0] = AccumulatorValue;
+            }
+
+            CountN = 0;
+        }
+    }
+
+    return 1;  // presumably the number of rows of A processed; confirm against the generic MlasGemmQuantKernel contract
+}
+
+const MLAS_GEMM_QUANT_DISPATCH MlasGemmU8X8DispatchWasmRelaxedSimd = {  // chosen by MlasGemmQuantGetDispatch for unsigned A when HasUSDot() is true
+    MlasGemmQuantOperation<MLAS_GEMM_U8X8_KERNEL_WASMRELAXEDSIMD>,
+    nullptr,
+    nullptr,
+    MLAS_GEMM_U8X8_KERNEL_WASMRELAXEDSIMD::PackedK,
+    0,
+    4 // multiple of kernel stride M
+};

From b0857c49d100e1aeb13115c9d6bee77df50c1eea Mon Sep 17 00:00:00 2001
From: "jing.bao" <jing.bao@intel.com>
Date: Tue, 5 Nov 2024 14:06:42 +0800
Subject: [PATCH 3/8] Use env.wasm.relaxedSimd to select relaxed SIMD wasm

Now we don't need the file renaming trick.
---
 js/common/lib/env.ts                 |  9 +++++++
 js/web/lib/backend-wasm.ts           |  4 ++++
 js/web/lib/wasm/wasm-factory.ts      | 36 +++++++++++++++++++++++++++-
 js/web/lib/wasm/wasm-utils-import.ts |  9 +++++--
 4 files changed, 55 insertions(+), 3 deletions(-)

diff --git a/js/common/lib/env.ts b/js/common/lib/env.ts
index d6d9f7fa48790..1a3c9556d243c 100644
--- a/js/common/lib/env.ts
+++ b/js/common/lib/env.ts
@@ -52,6 +52,15 @@ export declare namespace Env {
      */
     simd?: boolean;
 
+    /**
+     * set or get a boolean value indicating whether to enable Relaxed SIMD. If set to false, Relaxed SIMD will be forcibly disabled.
+     *
+     * This setting is available only when WebAssembly Relaxed SIMD feature is available in current context.
+     *
+     * @defaultValue `false`
+     */
+    relaxedSimd?: boolean;
+
     /**
      * set or get a boolean value indicating whether to enable trace.
      *
diff --git a/js/web/lib/backend-wasm.ts b/js/web/lib/backend-wasm.ts
index 72b51d565896a..fa687f5a0e88a 100644
--- a/js/web/lib/backend-wasm.ts
+++ b/js/web/lib/backend-wasm.ts
@@ -25,6 +25,10 @@ export const initializeFlags = (): void => {
     );
   }
 
+  if (typeof env.wasm.relaxedSimd !== 'boolean') {
+    env.wasm.relaxedSimd = false;
+  }
+
   if (typeof env.wasm.proxy !== 'boolean') {
     env.wasm.proxy = false;
   }
diff --git a/js/web/lib/wasm/wasm-factory.ts b/js/web/lib/wasm/wasm-factory.ts
index 0f49d25040409..e9743c7a3f308 100644
--- a/js/web/lib/wasm/wasm-factory.ts
+++ b/js/web/lib/wasm/wasm-factory.ts
@@ -64,6 +64,32 @@ const isSimdSupported = (): boolean => {
   }
 };
 
+const isRelaxedSimdSupported = (): boolean => {
+  try {
+    // Test for WebAssembly Relaxed SIMD capability (for both browsers and Node.js).
+    // The typed array below is a minimal WebAssembly module using a Relaxed SIMD instruction; it is only validated.
+
+    // The binary data is generated from the following code by wat2wasm:
+    // (module
+    //   (func (result v128)
+    //      i32.const 1
+    //      i8x16.splat
+    //      i32.const 2
+    //      i8x16.splat
+    //      i8x16.relaxed_swizzle
+    //   )
+    //  )
+    return WebAssembly.validate(
+      new Uint8Array([
+        0, 97, 115, 109, 1, 0, 0, 0, 1, 5, 1, 96, 0, 1, 123, 3, 2, 1, 0, 10, 15, 1, 13, 0, 65, 1, 253, 15, 65, 2, 253,
+        15, 253, 128, 2, 11,
+      ]),
+    );
+  } catch (e) {
+    return false; // any exception from the probe is reported as "not supported"
+  }
+};
+
 export const initializeWebAssembly = async (flags: Env.WebAssemblyFlags): Promise<void> => {
   if (initialized) {
     return Promise.resolve();
@@ -80,11 +106,14 @@ export const initializeWebAssembly = async (flags: Env.WebAssemblyFlags): Promis
   // wasm flags are already initialized
   const timeout = flags.initTimeout!;
   let numThreads = flags.numThreads!;
+  const relaxedSimd = flags.relaxedSimd!;
 
   // ensure SIMD is supported
   if (!isSimdSupported()) {
     throw new Error('WebAssembly SIMD is not supported in the current environment.');
   }
+  // check whether to use relaxed SIMD (requested by the flag and supported by the runtime)
+  const useRelaxedSimd = relaxedSimd && isRelaxedSimdSupported();
 
   // check if multi-threading is supported
   const multiThreadSupported = isMultiThreadSupported();
@@ -116,7 +145,12 @@ export const initializeWebAssembly = async (flags: Env.WebAssemblyFlags): Promis
   const wasmPathOverride = (wasmPathOverrideFlag as URL)?.href ?? wasmPathOverrideFlag;
   const wasmBinaryOverride = flags.wasmBinary;
 
-  const [objectUrl, ortWasmFactory] = await importWasmModule(mjsPathOverride, wasmPrefixOverride, numThreads > 1);
+  const [objectUrl, ortWasmFactory] = await importWasmModule(
+    mjsPathOverride,
+    wasmPrefixOverride,
+    numThreads > 1,
+    useRelaxedSimd,
+  );
 
   let isTimeout = false;
 
diff --git a/js/web/lib/wasm/wasm-utils-import.ts b/js/web/lib/wasm/wasm-utils-import.ts
index a8e27f6f334bc..090ff9781b65f 100644
--- a/js/web/lib/wasm/wasm-utils-import.ts
+++ b/js/web/lib/wasm/wasm-utils-import.ts
@@ -234,13 +234,18 @@ export const importWasmModule = async (
   urlOverride: string | undefined,
   prefixOverride: string | undefined,
   isMultiThreaded: boolean,
+  isRelaxedSimd: boolean,
 ): Promise<[undefined | string, EmscriptenModuleFactory<OrtWasmModule>]> => {
   if (!urlOverride && !prefixOverride && embeddedWasmModule && scriptSrc && isSameOrigin(scriptSrc)) {
     return [undefined, embeddedWasmModule];
   } else {
     const wasmModuleFilename = !BUILD_DEFS.DISABLE_JSEP
-      ? 'ort-wasm-simd-threaded.jsep.mjs'
-      : 'ort-wasm-simd-threaded.mjs';
+      ? isRelaxedSimd
+        ? 'ort-wasm-relaxedsimd-threaded.jsep.mjs'
+        : 'ort-wasm-simd-threaded.jsep.mjs'
+      : isRelaxedSimd
+        ? 'ort-wasm-relaxedsimd-threaded.mjs'
+        : 'ort-wasm-simd-threaded.mjs';
     const wasmModuleUrl = urlOverride ?? normalizeUrl(wasmModuleFilename, prefixOverride);
     // need to preload if all of the following conditions are met:
     // 1. not in Node.js.

From 1e5e01f82dbaf53c392fc09ecb64d41766fc2bff Mon Sep 17 00:00:00 2001
From: "jing.bao" <jing.bao@intel.com>
Date: Mon, 11 Nov 2024 13:35:09 +0800
Subject: [PATCH 4/8] Update unittests and npm test

Unit tests for relaxed SIMD need Node.js v21 or newer.
Add wasm.relaxedSimd to test-runner-cli-args.
---
 cmake/onnxruntime_unittests.cmake     | 6 +++++-
 js/web/script/test-runner-cli-args.ts | 4 ++++
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake
index 87aee2a174fab..b603ad38d7acc 100644
--- a/cmake/onnxruntime_unittests.cmake
+++ b/cmake/onnxruntime_unittests.cmake
@@ -222,8 +222,12 @@ function(AddTest)
       else()
         set(TEST_NODE_FLAGS)
 
+        if (onnxruntime_ENABLE_WEBASSEMBLY_RELAXED_SIMD)
+          message(WARNING "Use system `node` to test Wasm relaxed SIMD. Please make sure to install node v21 or newer.")
+          set(NODE_EXECUTABLE node)
+          set(TEST_NODE_FLAGS)
         # prefer Node from emsdk so the version is more deterministic
-        if (DEFINED ENV{EMSDK_NODE})
+        elseif (DEFINED ENV{EMSDK_NODE})
           set(NODE_EXECUTABLE $ENV{EMSDK_NODE})
         else()
           message(WARNING "EMSDK_NODE environment variable was not set. Falling back to system `node`.")
diff --git a/js/web/script/test-runner-cli-args.ts b/js/web/script/test-runner-cli-args.ts
index 088a66b24f7bd..aa395f9db5658 100644
--- a/js/web/script/test-runner-cli-args.ts
+++ b/js/web/script/test-runner-cli-args.ts
@@ -305,6 +305,10 @@ function parseWasmFlags(args: minimist.ParsedArgs): Env.WebAssemblyFlags {
   if (typeof simd !== 'undefined' && typeof simd !== 'boolean') {
     throw new Error('Flag "wasm.simd"/"wasm-enable-simd" must be a boolean value');
   }
+  const relaxedSimd = (wasm.relaxedSimd = parseBooleanArg(wasm.relaxedSimd));
+  if (typeof relaxedSimd !== 'undefined' && typeof relaxedSimd !== 'boolean') {
+    throw new Error('Flag "wasm.relaxedSimd" must be a boolean value');
+  }
   const proxy = (wasm.proxy = parseBooleanArg(wasm.proxy ?? args['wasm-enable-proxy']));
   if (typeof proxy !== 'undefined' && typeof proxy !== 'boolean') {
     throw new Error('Flag "wasm.proxy"/"wasm-enable-proxy" must be a boolean value');

From b3703e771793c025e910e38c1c4afd8c78cc9238 Mon Sep 17 00:00:00 2001
From: "jing.bao" <jing.bao@intel.com>
Date: Mon, 11 Nov 2024 13:56:34 +0800
Subject: [PATCH 5/8] Formatting

---
 tools/ci_build/build.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py
index 42f208315c627..d32d287fc9eb3 100644
--- a/tools/ci_build/build.py
+++ b/tools/ci_build/build.py
@@ -1423,7 +1423,9 @@ def generate_build_tree(
         cmake_args.append("-Donnxruntime_DNNL_ACL_ROOT=" + args.dnnl_acl_root)
     if args.build_wasm:
         cmake_args.append("-Donnxruntime_ENABLE_WEBASSEMBLY_SIMD=" + ("ON" if args.enable_wasm_simd else "OFF"))
-        cmake_args.append("-Donnxruntime_ENABLE_WEBASSEMBLY_RELAXED_SIMD=" + ("ON" if args.enable_wasm_relaxed_simd else "OFF"))
+        cmake_args.append(
+            "-Donnxruntime_ENABLE_WEBASSEMBLY_RELAXED_SIMD=" + ("ON" if args.enable_wasm_relaxed_simd else "OFF")
+        )
     if args.use_migraphx:
         cmake_args.append("-Donnxruntime_MIGRAPHX_HOME=" + migraphx_home)
     if args.use_rocm:

From 6ac7ca20bc8f89d1e76fc245c1b8da62540d4666 Mon Sep 17 00:00:00 2001
From: "jing.bao" <jing.bao@intel.com>
Date: Mon, 2 Dec 2024 15:50:18 +0800
Subject: [PATCH 6/8] Add build flag check

Raise an error when --enable_wasm_relaxed_simd is passed without --enable_wasm_simd.
Fix comments.
---
 cmake/CMakeLists.txt                                     | 1 +
 cmake/onnxruntime_mlas.cmake                             | 2 +-
 .../core/mlas/lib/qgemm_kernel_wasmrelaxedsimd.cpp       | 4 ++--
 tools/ci_build/build.py                                  | 9 ++++++---
 4 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
index 0b32d51884c05..57c5be5c59515 100644
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@@ -196,6 +196,7 @@ option(onnxruntime_ENABLE_WEBASSEMBLY_DEBUG_INFO "Enable this option to turn on
 option(onnxruntime_ENABLE_WEBASSEMBLY_PROFILING "Enable this option to turn on WebAssembly profiling and preserve function names" OFF)
 option(onnxruntime_ENABLE_WEBASSEMBLY_OUTPUT_OPTIMIZED_MODEL "Enable this option to allow WebAssembly to output optimized model" OFF)
 option(onnxruntime_ENABLE_WEBASSEMBLY_MEMORY64 "Enable this option to allow WebAssembly to use 64bit memory" OFF)
+option(onnxruntime_ENABLE_WEBASSEMBLY_RELAXED_SIMD "Enable WebAssembly Relaxed SIMD" OFF)
 
 # Enable bitcode for iOS
 option(onnxruntime_ENABLE_BITCODE "Enable bitcode for iOS only" OFF)
diff --git a/cmake/onnxruntime_mlas.cmake b/cmake/onnxruntime_mlas.cmake
index 714b14374f0fe..e214bb71f0b03 100644
--- a/cmake/onnxruntime_mlas.cmake
+++ b/cmake/onnxruntime_mlas.cmake
@@ -252,7 +252,7 @@ function(setup_mlas_source_for_windows)
 endfunction()
 
 if (CMAKE_SYSTEM_NAME STREQUAL "Emscripten")
-  if (onnxruntime_ENABLE_WEBASSEMBLY_SIMD OR onnxruntime_ENABLE_WEBASSEMBLY_RELAXED_SIMD)
+  if (onnxruntime_ENABLE_WEBASSEMBLY_SIMD)
     file(GLOB_RECURSE mlas_platform_srcs
       "${MLAS_SRC_DIR}/wasm_simd/*.cpp"
     )
diff --git a/onnxruntime/core/mlas/lib/qgemm_kernel_wasmrelaxedsimd.cpp b/onnxruntime/core/mlas/lib/qgemm_kernel_wasmrelaxedsimd.cpp
index 2c398eadead54..be0f63f6240fd 100644
--- a/onnxruntime/core/mlas/lib/qgemm_kernel_wasmrelaxedsimd.cpp
+++ b/onnxruntime/core/mlas/lib/qgemm_kernel_wasmrelaxedsimd.cpp
@@ -6,11 +6,11 @@ Licensed under the MIT License.
 
 Module Name:
 
-    qgemm_kernel_wasmsimd.cpp
+    qgemm_kernel_wasmrelaxedsimd.cpp
 
 Abstract:
 
-    This module implements QGEMM kernel for WebAssembly SIMD128.
+    This module implements QGEMM kernel for WebAssembly Relaxed SIMD128.
 
 --*/
 
diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py
index d32d287fc9eb3..3899767e1a2ee 100644
--- a/tools/ci_build/build.py
+++ b/tools/ci_build/build.py
@@ -1423,9 +1423,12 @@ def generate_build_tree(
         cmake_args.append("-Donnxruntime_DNNL_ACL_ROOT=" + args.dnnl_acl_root)
     if args.build_wasm:
         cmake_args.append("-Donnxruntime_ENABLE_WEBASSEMBLY_SIMD=" + ("ON" if args.enable_wasm_simd else "OFF"))
-        cmake_args.append(
-            "-Donnxruntime_ENABLE_WEBASSEMBLY_RELAXED_SIMD=" + ("ON" if args.enable_wasm_relaxed_simd else "OFF")
-        )
+        if args.enable_wasm_relaxed_simd:
+            if not args.enable_wasm_simd:
+                raise BuildError(
+                    "Wasm Relaxed SIMD (--enable_wasm_relaxed_simd) is only available with Wasm SIMD (--enable_wasm_simd)."
+                )
+            cmake_args += ["-Donnxruntime_ENABLE_WEBASSEMBLY_RELAXED_SIMD=ON"]
     if args.use_migraphx:
         cmake_args.append("-Donnxruntime_MIGRAPHX_HOME=" + migraphx_home)
     if args.use_rocm:

From 6b4e0739938fd1e8c2185b85a9f13ddd50508152 Mon Sep 17 00:00:00 2001
From: "jing.bao" <jing.bao@intel.com>
Date: Mon, 2 Dec 2024 16:15:38 +0800
Subject: [PATCH 7/8] Check i32x4.relaxed_dot_i8x16_i7x16_add_s exactly

---
 js/web/lib/wasm/wasm-factory.ts | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/js/web/lib/wasm/wasm-factory.ts b/js/web/lib/wasm/wasm-factory.ts
index e9743c7a3f308..5d872e2037ee8 100644
--- a/js/web/lib/wasm/wasm-factory.ts
+++ b/js/web/lib/wasm/wasm-factory.ts
@@ -76,13 +76,15 @@ const isRelaxedSimdSupported = (): boolean => {
     //      i8x16.splat
     //      i32.const 2
     //      i8x16.splat
-    //      i8x16.relaxed_swizzle
+    //      i32.const 3
+    //      i8x16.splat
+    //      i32x4.relaxed_dot_i8x16_i7x16_add_s
     //   )
     //  )
     return WebAssembly.validate(
       new Uint8Array([
-        0, 97, 115, 109, 1, 0, 0, 0, 1, 5, 1, 96, 0, 1, 123, 3, 2, 1, 0, 10, 15, 1, 13, 0, 65, 1, 253, 15, 65, 2, 253,
-        15, 253, 128, 2, 11,
+        0, 97, 115, 109, 1, 0, 0, 0, 1, 5, 1, 96, 0, 1, 123, 3, 2, 1, 0, 10, 19, 1, 17, 0, 65, 1, 253, 15, 65, 2, 253,
+        15, 65, 3, 253, 15, 253, 147, 2, 11,
       ]),
     );
   } catch (e) {

From 2e222c8c90b2ce125625de5f40f4920218388201 Mon Sep 17 00:00:00 2001
From: "jing.bao" <jing.bao@intel.com>
Date: Tue, 11 Mar 2025 15:07:31 +0800
Subject: [PATCH 8/8] keep API unchanged.

Drive-by:
- Rebase
- Fix comments
- Remove unused wasm functions
---
 cmake/onnxruntime_unittests.cmake             |  1 -
 js/common/lib/env.ts                          |  9 -----
 js/web/lib/backend-wasm.ts                    |  4 --
 js/web/lib/wasm/wasm-factory.ts               | 38 +------------------
 js/web/lib/wasm/wasm-utils-import.ts          |  9 +----
 js/web/script/test-runner-cli-args.ts         |  4 --
 .../mlas/lib/qgemm_kernel_wasmrelaxedsimd.cpp |  7 +---
 7 files changed, 4 insertions(+), 68 deletions(-)

diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake
index b603ad38d7acc..2e2fb6d858dfc 100644
--- a/cmake/onnxruntime_unittests.cmake
+++ b/cmake/onnxruntime_unittests.cmake
@@ -225,7 +225,6 @@ function(AddTest)
         if (onnxruntime_ENABLE_WEBASSEMBLY_RELAXED_SIMD)
           message(WARNING "Use system `node` to test Wasm relaxed SIMD. Please make sure to install node v21 or newer.")
           set(NODE_EXECUTABLE node)
-          set(TEST_NODE_FLAGS)
         # prefer Node from emsdk so the version is more deterministic
         elseif (DEFINED ENV{EMSDK_NODE})
           set(NODE_EXECUTABLE $ENV{EMSDK_NODE})
diff --git a/js/common/lib/env.ts b/js/common/lib/env.ts
index 1a3c9556d243c..d6d9f7fa48790 100644
--- a/js/common/lib/env.ts
+++ b/js/common/lib/env.ts
@@ -52,15 +52,6 @@ export declare namespace Env {
      */
     simd?: boolean;
 
-    /**
-     * set or get a boolean value indicating whether to enable Relaxed SIMD. If set to false, Relaxed SIMD will be forcely disabled.
-     *
-     * This setting is available only when WebAssembly Relaxed SIMD feature is available in current context.
-     *
-     * @defaultValue `false`
-     */
-    relaxedSimd?: boolean;
-
     /**
      * set or get a boolean value indicating whether to enable trace.
      *
diff --git a/js/web/lib/backend-wasm.ts b/js/web/lib/backend-wasm.ts
index fa687f5a0e88a..72b51d565896a 100644
--- a/js/web/lib/backend-wasm.ts
+++ b/js/web/lib/backend-wasm.ts
@@ -25,10 +25,6 @@ export const initializeFlags = (): void => {
     );
   }
 
-  if (typeof env.wasm.relaxedSimd !== 'boolean') {
-    env.wasm.relaxedSimd = false;
-  }
-
   if (typeof env.wasm.proxy !== 'boolean') {
     env.wasm.proxy = false;
   }
diff --git a/js/web/lib/wasm/wasm-factory.ts b/js/web/lib/wasm/wasm-factory.ts
index 5d872e2037ee8..0f49d25040409 100644
--- a/js/web/lib/wasm/wasm-factory.ts
+++ b/js/web/lib/wasm/wasm-factory.ts
@@ -64,34 +64,6 @@ const isSimdSupported = (): boolean => {
   }
 };
 
-const isRelaxedSimdSupported = (): boolean => {
-  try {
-    // Test for WebAssembly Relaxed SIMD capability (for both browsers and Node.js)
-    // This typed array is a WebAssembly program containing Relaxed SIMD instructions.
-
-    // The binary data is generated from the following code by wat2wasm:
-    // (module
-    //   (func (result v128)
-    //      i32.const 1
-    //      i8x16.splat
-    //      i32.const 2
-    //      i8x16.splat
-    //      i32.const 3
-    //      i8x16.splat
-    //      i32x4.relaxed_dot_i8x16_i7x16_add_s
-    //   )
-    //  )
-    return WebAssembly.validate(
-      new Uint8Array([
-        0, 97, 115, 109, 1, 0, 0, 0, 1, 5, 1, 96, 0, 1, 123, 3, 2, 1, 0, 10, 19, 1, 17, 0, 65, 1, 253, 15, 65, 2, 253,
-        15, 65, 3, 253, 15, 253, 147, 2, 11,
-      ]),
-    );
-  } catch (e) {
-    return false;
-  }
-};
-
 export const initializeWebAssembly = async (flags: Env.WebAssemblyFlags): Promise<void> => {
   if (initialized) {
     return Promise.resolve();
@@ -108,14 +80,11 @@ export const initializeWebAssembly = async (flags: Env.WebAssemblyFlags): Promis
   // wasm flags are already initialized
   const timeout = flags.initTimeout!;
   let numThreads = flags.numThreads!;
-  const relaxedSimd = flags.relaxedSimd!;
 
   // ensure SIMD is supported
   if (!isSimdSupported()) {
     throw new Error('WebAssembly SIMD is not supported in the current environment.');
   }
-  // check if use relaxed simd
-  const useRelaxedSimd = relaxedSimd && isRelaxedSimdSupported();
 
   // check if multi-threading is supported
   const multiThreadSupported = isMultiThreadSupported();
@@ -147,12 +116,7 @@ export const initializeWebAssembly = async (flags: Env.WebAssemblyFlags): Promis
   const wasmPathOverride = (wasmPathOverrideFlag as URL)?.href ?? wasmPathOverrideFlag;
   const wasmBinaryOverride = flags.wasmBinary;
 
-  const [objectUrl, ortWasmFactory] = await importWasmModule(
-    mjsPathOverride,
-    wasmPrefixOverride,
-    numThreads > 1,
-    useRelaxedSimd,
-  );
+  const [objectUrl, ortWasmFactory] = await importWasmModule(mjsPathOverride, wasmPrefixOverride, numThreads > 1);
 
   let isTimeout = false;
 
diff --git a/js/web/lib/wasm/wasm-utils-import.ts b/js/web/lib/wasm/wasm-utils-import.ts
index 090ff9781b65f..a8e27f6f334bc 100644
--- a/js/web/lib/wasm/wasm-utils-import.ts
+++ b/js/web/lib/wasm/wasm-utils-import.ts
@@ -234,18 +234,13 @@ export const importWasmModule = async (
   urlOverride: string | undefined,
   prefixOverride: string | undefined,
   isMultiThreaded: boolean,
-  isRelaxedSimd: boolean,
 ): Promise<[undefined | string, EmscriptenModuleFactory<OrtWasmModule>]> => {
   if (!urlOverride && !prefixOverride && embeddedWasmModule && scriptSrc && isSameOrigin(scriptSrc)) {
     return [undefined, embeddedWasmModule];
   } else {
     const wasmModuleFilename = !BUILD_DEFS.DISABLE_JSEP
-      ? isRelaxedSimd
-        ? 'ort-wasm-relaxedsimd-threaded.jsep.mjs'
-        : 'ort-wasm-simd-threaded.jsep.mjs'
-      : isRelaxedSimd
-        ? 'ort-wasm-relaxedsimd-threaded.mjs'
-        : 'ort-wasm-simd-threaded.mjs';
+      ? 'ort-wasm-simd-threaded.jsep.mjs'
+      : 'ort-wasm-simd-threaded.mjs';
     const wasmModuleUrl = urlOverride ?? normalizeUrl(wasmModuleFilename, prefixOverride);
     // need to preload if all of the following conditions are met:
     // 1. not in Node.js.
diff --git a/js/web/script/test-runner-cli-args.ts b/js/web/script/test-runner-cli-args.ts
index aa395f9db5658..088a66b24f7bd 100644
--- a/js/web/script/test-runner-cli-args.ts
+++ b/js/web/script/test-runner-cli-args.ts
@@ -305,10 +305,6 @@ function parseWasmFlags(args: minimist.ParsedArgs): Env.WebAssemblyFlags {
   if (typeof simd !== 'undefined' && typeof simd !== 'boolean') {
     throw new Error('Flag "wasm.simd"/"wasm-enable-simd" must be a boolean value');
   }
-  const relaxedSimd = (wasm.relaxedSimd = parseBooleanArg(wasm.relaxedSimd));
-  if (typeof relaxedSimd !== 'undefined' && typeof relaxedSimd !== 'boolean') {
-    throw new Error('Flag "wasm.relaxedSimd" must be a boolean value');
-  }
   const proxy = (wasm.proxy = parseBooleanArg(wasm.proxy ?? args['wasm-enable-proxy']));
   if (typeof proxy !== 'undefined' && typeof proxy !== 'boolean') {
     throw new Error('Flag "wasm.proxy"/"wasm-enable-proxy" must be a boolean value');
diff --git a/onnxruntime/core/mlas/lib/qgemm_kernel_wasmrelaxedsimd.cpp b/onnxruntime/core/mlas/lib/qgemm_kernel_wasmrelaxedsimd.cpp
index be0f63f6240fd..a3a0fa758d377 100644
--- a/onnxruntime/core/mlas/lib/qgemm_kernel_wasmrelaxedsimd.cpp
+++ b/onnxruntime/core/mlas/lib/qgemm_kernel_wasmrelaxedsimd.cpp
@@ -18,7 +18,7 @@ Module Name:
 #include "qgemm.h"
 
 bool HasUSDot() {
-// Check out-of-bounds behaviour of Relaxed Integer Dot Product with Accumulation with signed and unsigned input (e.g. vpdpbusd).
+// Check out-of-bounds behavior of Relaxed Integer Dot Product with Accumulation with signed and unsigned input (e.g. vpdpbusd).
       const v128_t int8_input = wasm_i8x16_const(0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0);
       const volatile v128_t xint8_input = wasm_i8x16_const(0, 0, 0, -128, 0, 0, -128, 0, 0, -128, 0, 0, -128, 0, 0, 0);  // volatile to confuse Clang which otherwise ICE's
       const v128_t xint8_output = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(int8_input, xint8_input, wasm_i8x16_const_splat(0));
@@ -35,11 +35,6 @@ v128_t __attribute__((__always_inline__, __nodebug__)) wasm_i8x16_unpacklo_relax
     return wasm_i8x16_shuffle(a, b, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
 }
 
-// wasm implementation of "_mm_unpackhi_epi8"
-v128_t __attribute__((__always_inline__, __nodebug__)) wasm_i8x16_unpackhi_relaxed(v128_t a, v128_t b) {
-    return wasm_i8x16_shuffle(a, b, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
-}
-
 // wasm implementation of "_mm_unpacklo_epi16"
 v128_t __attribute__((__always_inline__, __nodebug__)) wasm_i16x8_unpacklo_relaxed(v128_t a, v128_t b) {
     return wasm_i8x16_shuffle(a, b, 0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23);