Skip to content

Commit 15219e2

Browse files
authored
turn on neural_speed by default (microsoft#19627)
### Description <!-- Describe your changes. --> The crash caused by neural_speed turns out to be a very rare corner case, so neural_speed is now turned on by default. ### Motivation and Context <!-- - Why is this change required? What problem does it solve? - If it fixes an open issue, please link to the issue here. -->
1 parent 6b305f9 commit 15219e2

File tree

7 files changed

+38
-6
lines changed

7 files changed

+38
-6
lines changed

cgmanifests/generated/cgmanifest.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -206,7 +206,7 @@
206206
"component": {
207207
"type": "git",
208208
"git": {
209-
"commitHash": "c11386eb632eec7c1c2aa323142f73519f946e2a",
209+
"commitHash": "150e7527d5286ddd3a995c228dedf8d76a7a86bc",
210210
"repositoryUrl": "https://github.com/intel/neural-speed.git"
211211
},
212212
"comments": "neural_speed"

cmake/CMakeLists.txt

+2-2
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ option(onnxruntime_USE_QNN "Build with QNN support" OFF)
8888
option(onnxruntime_USE_SNPE "Build with SNPE support" OFF)
8989
option(onnxruntime_USE_RKNPU "Build with RKNPU support" OFF)
9090
option(onnxruntime_USE_DNNL "Build with DNNL support" OFF)
91-
option(onnxruntime_USE_NEURAL_SPEED "Build with Neural Speed support" OFF)
91+
option(onnxruntime_USE_NEURAL_SPEED "Build with Neural Speed support" ON)
9292
option(onnxruntime_USE_JSEP "Build with JavaScript implemented kernels support" OFF)
9393
option(onnxruntime_BUILD_UNIT_TESTS "Build ONNXRuntime unit tests" ON)
9494
option(onnxruntime_BUILD_CSHARP "Build C# library" OFF)
@@ -1206,7 +1206,7 @@ if (onnxruntime_USE_DNNL)
12061206
add_compile_definitions(DNNL_OPENMP)
12071207
endif()
12081208

1209-
if (onnxruntime_USE_NEURAL_SPEED AND NOT onnxruntime_MINIMAL_BUILD)
1209+
if (onnxruntime_USE_NEURAL_SPEED AND NOT onnxruntime_MINIMAL_BUILD AND NOT onnxruntime_USE_TVM)
12101210
include(neural_speed)
12111211
if (USE_NEURAL_SPEED)
12121212
list(APPEND onnxruntime_EXTERNAL_LIBRARIES neural_speed::bestla)

cmake/deps.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ microsoft_gsl;https://github.com/microsoft/GSL/archive/refs/tags/v4.0.0.zip;cf36
3535
microsoft_wil;https://github.com/microsoft/wil/archive/refs/tags/v1.0.230629.1.zip;e4a542a323c070376f7c2d1973d0f7ddbc1d2fa5
3636
mimalloc;https://github.com/microsoft/mimalloc/archive/refs/tags/v2.1.1.zip;d5ee7d34223d0567892db5179849939c8769dc41
3737
mp11;https://github.com/boostorg/mp11/archive/refs/tags/boost-1.82.0.zip;9bc9e01dffb64d9e0773b2e44d2f22c51aace063
38-
neural_speed;https://github.com/intel/neural-speed/archive/refs/tags/bestlav0.1.1.zip;65b0f7a0d04f72f0d5a8d48af70f0366f2ab3939
38+
neural_speed;https://github.com/intel/neural-speed/archive/refs/tags/v0.3.zip;5ec64e3071edc7347ebd8a81679cf06e2bb9b851
3939
onnx;https://github.com/onnx/onnx/archive/refs/tags/v1.15.0.zip;54c3f960a0541c5d8d3e60c2933e11f5d3688a11
4040
#use the commit of Final DDS removal. DDS output is now supported by ORT TRT.
4141
onnx_tensorrt;https://github.com/onnx/onnx-tensorrt/archive/bacfaaa951653cd4e72efe727a543567cb38f7de.zip;26434329612e804164ab7baa6ae629ada56c1b26

cmake/external/neural_speed.cmake

+1
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ if(USE_NEURAL_SPEED)
99
neural_speed
1010
URL ${DEP_URL_neural_speed}
1111
URL_HASH SHA1=${DEP_SHA1_neural_speed}
12+
PATCH_COMMAND ${Patch_EXECUTABLE} -p1 < ${PROJECT_SOURCE_DIR}/patches/neural_speed/150e7527d5286ddd3a995c228dedf8d76a7a86bc.patch
1213
)
1314
set(BTLA_USE_OPENMP OFF)
1415
onnxruntime_fetchcontent_makeavailable(neural_speed)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
diff --git a/bestla/bestla/bestla_prologue_b.h b/bestla/bestla/bestla_prologue_b.h
2+
index 99f3ccc..a11de9d 100644
3+
--- a/bestla/bestla/bestla_prologue_b.h
4+
+++ b/bestla/bestla/bestla_prologue_b.h
5+
@@ -456,9 +456,8 @@ class WeightKBlockNInteger {
6+
auto tmpscales = tmp;
7+
auto tmpzeropoints = reinterpret_cast<int8_t*>(tmpscales + N * blks);
8+
if (scales) {
9+
- for (size_t i = 0; i < N * blks; i += 2) {
10+
+ for (size_t i = 0; i < N * blks; i ++) {
11+
tmpscales[i] = scales[i] / 16;
12+
- tmpscales[i + 1] = scales[i + 1] / 16;
13+
}
14+
}
15+
if (zero_points) {
16+
diff --git a/bestla/bestla/kernel_avx512f.h b/bestla/bestla/kernel_avx512f.h
17+
index 6783ee8..59822e5 100644
18+
--- a/bestla/bestla/kernel_avx512f.h
19+
+++ b/bestla/bestla/kernel_avx512f.h
20+
@@ -673,8 +673,8 @@ inline BTLA_CODE decompress_kblock_s3_s8fp(utils::bit2x4* bit2ptr, utils::bit1x8
21+
zmm1 = _mm512_sllv_epi32(zmm1, zmm_shift); // int3_clip => int8
22+
zmm2 = _mm512_sllv_epi32(zmm2, zmm_shift); // int3_clip => int8
23+
24+
- _mm512_storeu_epi8((__m512i*)dst, zmm1);
25+
- _mm512_storeu_epi8((__m512i*)(dst + 64), zmm2);
26+
+ _mm512_storeu_si512((__m512i*)dst, zmm1);
27+
+ _mm512_storeu_si512((__m512i*)(dst + 64), zmm2);
28+
};
29+
30+
assert(head_ignore_num % 8 == 0);

onnxruntime/contrib_ops/cpu/quantization/neural_speed_wrapper.h

+1
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
#pragma warning(disable : 4244)
2828
#pragma warning(disable : 4267)
2929
#pragma warning(disable : 4702)
30+
#pragma warning(disable : 4127)
3031
#endif
3132

3233
#include "bestla/bestla_prologue_a.h"

tools/ci_build/github/azure-pipelines/templates/download-deps.yml

+2-2
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ steps:
1111
packageType: upack
1212
feed: '/7424c8e4-5c62-490e-95c4-79446f31017c'
1313
definition: '517c4f6f-5437-4392-a70d-4f15ec5be2f0'
14-
version: 1.0.143
14+
version: 1.0.145
1515
downloadPath: $(Build.BinariesDirectory)/deps
1616

1717
# The private ADO project
@@ -22,7 +22,7 @@ steps:
2222
packageType: upack
2323
feed: '/4c7631f5-24c0-4307-8822-1aa8f180c325'
2424
definition: 'fd9dd5ad-b73e-4678-890e-edcf680dbc1a'
25-
version: 1.0.143
25+
version: 1.0.145
2626
downloadPath: $(Build.BinariesDirectory)/deps
2727

2828
# You can add more ADO accounts at here.

0 commit comments

Comments
 (0)