microsoft
diff --git a/cmake/onnxruntime_mlas.cmake b/cmake/onnxruntime_mlas.cmake
Lines changed: 1 addition & 0 deletions
diff --git a/onnxruntime/core/mlas/lib/mlasi.h b/onnxruntime/core/mlas/lib/mlasi.h
Lines changed: 2 additions & 0 deletions
diff --git a/onnxruntime/core/mlas/lib/platform.cpp b/onnxruntime/core/mlas/lib/platform.cpp
Lines changed: 4 additions & 1 deletion
@@ -777,6 +777,7 @@ endif()
     if(LOONGARCH64 AND MLAS_SOURCE_IS_NOT_SET)
         set(mlas_platform_srcs
           ${MLAS_SRC_DIR}/qgemm_kernel_lsx.cpp
+          ${MLAS_SRC_DIR}/sqnbitgemm_kernel_lasx.cpp
           ${MLAS_SRC_DIR}/loongarch64/SgemmKernelLasx.S
           ${MLAS_SRC_DIR}/loongarch64/DgemmKernelLsx.S
           ${MLAS_SRC_DIR}/loongarch64/DgemmKernelLasx.S
 
@@ -1234,6 +1234,8 @@ extern const MLAS_QNBIT_GEMM_DISPATCH MlasSQNBitGemmDispatchAvx512;
 
 extern const MLAS_QNBIT_GEMM_DISPATCH MlasSQNBitGemmDispatchAvx512vnni;
 
+extern const MLAS_QNBIT_GEMM_DISPATCH MlasSQNBitGemmDispatchLasx;
+
 //
 // Rotary embedding dispatch structure.
 //
 
@@ -742,6 +742,9 @@ Return Value:
         this->ComputeLogSoftmaxOutputF32Kernel = MlasComputeLogSoftmaxOutputF32KernelLasx;
         this->TransposePackB16x4Routine = MlasSgemmTransposePackB16x4Lasx;
 
+        // Route QNBitGemm through the LASX dispatch table.
+        this->QNBitGemmDispatch = &MlasSQNBitGemmDispatchLasx;
+
         this->GemmU8S8Dispatch = &MlasGemmU8X8DispatchLSX;
         this->GemmU8U8Dispatch = &MlasGemmU8X8DispatchLSX;
     }else if( cap_lsx ){
@@ -824,4 +827,4 @@ thread_local size_t ThreadedBufSize = 0;
 thread_local std::unique_ptr<uint8_t, decltype(&_aligned_free)> ThreadedBufHolder(nullptr, &_aligned_free);
 #else
 thread_local std::unique_ptr<uint8_t, decltype(&free)> ThreadedBufHolder(nullptr, &free);
-#endif
+#endif