microsoft
diff --git a/‎.github/workflows/gradle-wrapper-validation.yml‎
Lines changed: 1 addition & 1 deletion b/‎.github/workflows/gradle-wrapper-validation.yml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎cmake/CMakeLists.txt‎
Lines changed: 1 addition & 1 deletion b/‎cmake/CMakeLists.txt‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎cmake/onnxruntime_mlas.cmake‎
Lines changed: 15 additions & 9 deletions b/‎cmake/onnxruntime_mlas.cmake‎
Lines changed: 15 additions & 9 deletions
diff --git a/‎cmake/onnxruntime_test_pch.cmake‎
Lines changed: 5 additions & 3 deletions b/‎cmake/onnxruntime_test_pch.cmake‎
Lines changed: 5 additions & 3 deletions
diff --git a/‎cmake/onnxruntime_unittests.cmake‎
Lines changed: 8 additions & 9 deletions b/‎cmake/onnxruntime_unittests.cmake‎
Lines changed: 8 additions & 9 deletions
diff --git a/‎js/node/src/session_options_helper.cc‎
Lines changed: 30 additions & 5 deletions b/‎js/node/src/session_options_helper.cc‎
Lines changed: 30 additions & 5 deletions
diff --git a/‎onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm/launchers/fpA_intB_launcher_sm90.inl‎
Lines changed: 2 additions & 1 deletion b/‎onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm/launchers/fpA_intB_launcher_sm90.inl‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm_profiler.cc‎
Lines changed: 2 additions & 0 deletions b/‎onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm_profiler.cc‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎onnxruntime/core/providers/cpu/ml/tree_ensemble_attribute.h‎
Lines changed: 5 additions & 5 deletions b/‎onnxruntime/core/providers/cpu/ml/tree_ensemble_attribute.h‎
Lines changed: 5 additions & 5 deletions
diff --git a/‎onnxruntime/test/contrib_ops/cuda_kernels/fpA_intB_gemm_kernel_test.cc‎
Lines changed: 2 additions & 1 deletion b/‎onnxruntime/test/contrib_ops/cuda_kernels/fpA_intB_gemm_kernel_test.cc‎
Lines changed: 2 additions & 1 deletion
@@ -16,7 +16,7 @@ jobs:
     runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-Ubuntu2204-AMD-CPU"]
     steps:
       - uses: actions/checkout@v5
-      - uses: gradle/actions/wrapper-validation@v4
+      - uses: gradle/actions/wrapper-validation@v5
 concurrency:
   group: ${{ github.workflow }}-${{ github.event_name == 'pull_request' && github.ref || github.sha }}
   cancel-in-progress: true
@@ -101,7 +101,7 @@ option(onnxruntime_USE_VSINPU "Build with VSINPU support" OFF)
 cmake_dependent_option(onnxruntime_USE_FLASH_ATTENTION "Build flash attention kernel for scaled dot product attention" ON "onnxruntime_USE_CUDA" OFF)
 option(onnxruntime_USE_LEAN_ATTENTION "Build lean attention kernel for scaled dot product attention" OFF)
 cmake_dependent_option(onnxruntime_USE_MEMORY_EFFICIENT_ATTENTION "Build memory efficient attention kernel for scaled dot product attention" ON "onnxruntime_USE_CUDA" OFF)
-cmake_dependent_option(onnxruntime_USE_FPA_INTB_GEMM "Build FpA IntB gemm cuda kernels" ON "onnxruntime_USE_CUDA" OFF)
+option(onnxruntime_USE_FPA_INTB_GEMM "Build FpA IntB gemm cuda kernels" OFF)
 
 option(onnxruntime_BUILD_FOR_NATIVE_MACHINE "Enable this option for turning on optimization specific to this machine" OFF)
 option(onnxruntime_USE_AVX "Use AVX instructions" OFF)
 
@@ -5,6 +5,9 @@ set(MLAS_ROOT ${ONNXRUNTIME_ROOT}/core/mlas)
 set(MLAS_SRC_DIR ${MLAS_ROOT}/lib)
 set(MLAS_INC_DIR ${MLAS_ROOT}/inc)
 
+# mlas_private_compile_definitions contains compile definitions that are private to onnxruntime_mlas and targets which
+# use internal MLAS headers like mlasi.h.
+set(mlas_private_compile_definitions)
 #
 # All hardware agnostic source files here
 # hardware specific files would cause trouble in
@@ -133,9 +136,9 @@ function(setup_mlas_source_for_windows)
       )
 
       if (onnxruntime_USE_ARM_NEON_NCHWC)
-		setup_arm_neon_nchwc()	
+		setup_arm_neon_nchwc()
 	  endif()
-      
+
 	  if (onnxruntime_USE_KLEIDIAI)
         setup_kleidiai()
       endif()
@@ -293,11 +296,12 @@ endfunction()
 
 # Adds the ARM NEON NCHWc kernel sources to onnxruntime_mlas and records the
 # MLAS_USE_ARM_NEON_NCHWC definition in mlas_private_compile_definitions.
 # NOTE(review): PARENT_SCOPE only updates the immediate caller's scope. If this is
 # invoked from inside another function (the Windows setup path appears to do so),
 # the appended definition may not reach the directory scope - confirm.
 function (setup_arm_neon_nchwc)
   target_sources(onnxruntime_mlas PRIVATE
-   ${MLAS_SRC_DIR}/sconv.h  
+   ${MLAS_SRC_DIR}/sconv.h
    ${MLAS_SRC_DIR}/sconv_kernel_neon.cpp
    ${MLAS_SRC_DIR}/spool_kernel_neon.cpp
   )
-  target_compile_definitions(onnxruntime_mlas PRIVATE MLAS_USE_ARM_NEON_NCHWC)
+  list(APPEND mlas_private_compile_definitions MLAS_USE_ARM_NEON_NCHWC)
+  set(mlas_private_compile_definitions ${mlas_private_compile_definitions} PARENT_SCOPE)
 endfunction ()
 
 if (CMAKE_SYSTEM_NAME STREQUAL "Emscripten")
@@ -445,25 +449,25 @@ else()
           ${MLAS_SRC_DIR}/eltwise_kernel_neon.cpp
           ${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon_int8_i8mm.cpp
         )
-        
+
         # Conditionally add the SVE implementation if compiler supports it
         if (onnxruntime_USE_SVE)
           list(APPEND mlas_platform_srcs ${MLAS_SRC_DIR}/sve/mlasi_sve.h)
           list(APPEND mlas_platform_srcs ${MLAS_SRC_DIR}/sve/elementwise_sve.cpp)
           set_source_files_properties(${MLAS_SRC_DIR}/sve/elementwise_sve.cpp PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+sve+fp16 ")
-          target_compile_definitions(onnxruntime_mlas PRIVATE MLAS_USE_SVE)
+          list(APPEND mlas_private_compile_definitions MLAS_USE_SVE)
         endif()
 
         if (onnxruntime_USE_ARM_NEON_NCHWC)
-		  setup_arm_neon_nchwc()	
+		  setup_arm_neon_nchwc()
 		endif()
-        
+
 		if (onnxruntime_USE_KLEIDIAI)
           setup_kleidiai()
         endif()
         set_source_files_properties(${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon_int8.cpp
                                     PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+dotprod")
-        set_source_files_properties(${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon_int8_i8mm.cpp 
+        set_source_files_properties(${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon_int8_i8mm.cpp
 				    PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+i8mm ")
 
         if (NOT APPLE)
@@ -806,6 +810,8 @@ foreach(mlas_target ${ONNXRUNTIME_MLAS_LIBS})
     target_include_directories(${mlas_target} PRIVATE ${MLAS_INC_DIR} ${MLAS_SRC_DIR})
     onnxruntime_add_include_to_target(${mlas_target} ${GSL_TARGET})
 
+    target_compile_definitions(${mlas_target} PRIVATE ${mlas_private_compile_definitions})
+
     set_target_properties(${mlas_target} PROPERTIES FOLDER "ONNXRuntime")
 endforeach()
 
 
@@ -5,9 +5,11 @@ if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
   target_precompile_headers(onnxruntime_test_all PRIVATE
     "${CMAKE_CURRENT_SOURCE_DIR}/test_pch.h"
   )
-  target_precompile_headers(onnxruntime_provider_test PRIVATE
-    "${CMAKE_CURRENT_SOURCE_DIR}/test_pch.h"
-  )
+  if (TARGET onnxruntime_provider_test)
+    target_precompile_headers(onnxruntime_provider_test PRIVATE
+      "${CMAKE_CURRENT_SOURCE_DIR}/test_pch.h"
+    )
+  endif()
 endif()
 
 # Exclude certain files that might conflict with PCH
 
@@ -1228,6 +1228,11 @@ block()
     LIBS ${onnxruntime_provider_test_libs}
     DEPENDS ${onnxruntime_provider_test_deps}
   )
+  if (UNIX AND (onnxruntime_USE_TENSORRT OR onnxruntime_USE_NV))
+    # test_main.cc includes NvInfer.h, which has many deprecated declarations; ignore them for TensorRT EP builds.
+    # APPEND_STRING concatenates verbatim, so the leading space keeps this flag separate from existing flags.
+    set_property(TARGET onnxruntime_provider_test APPEND_STRING PROPERTY COMPILE_FLAGS " -Wno-deprecated-declarations")
+  endif()
 
   # enable dynamic plugin EP usage
   target_compile_definitions(onnxruntime_provider_test PRIVATE ORT_UNIT_TEST_ENABLE_DYNAMIC_PLUGIN_EP_USAGE)
@@ -1325,9 +1330,7 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP)
       ${BENCHMARK_DIR}/layer_normalization.cc)
     target_include_directories(onnxruntime_benchmark PRIVATE ${ONNXRUNTIME_ROOT} ${onnxruntime_graph_header} ${ONNXRUNTIME_ROOT}/core/mlas/inc)
     target_compile_definitions(onnxruntime_benchmark PRIVATE BENCHMARK_STATIC_DEFINE)
-    if (onnxruntime_USE_SVE)
-      target_compile_definitions(onnxruntime_benchmark PRIVATE MLAS_USE_SVE)
-    endif()
+    target_compile_definitions(onnxruntime_benchmark PRIVATE ${mlas_private_compile_definitions})
     if(WIN32)
       target_compile_options(onnxruntime_benchmark PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:--compiler-options /wd4141>"
                         "$<$<NOT:$<COMPILE_LANGUAGE:CUDA>>:/wd4141>")
@@ -1355,9 +1358,7 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP)
     target_include_directories(onnxruntime_mlas_benchmark PRIVATE ${ONNXRUNTIME_ROOT}/core/mlas/inc)
     target_link_libraries(onnxruntime_mlas_benchmark PRIVATE benchmark::benchmark onnxruntime_util ${ONNXRUNTIME_MLAS_LIBS} onnxruntime_common ${CMAKE_DL_LIBS})
     target_compile_definitions(onnxruntime_mlas_benchmark PRIVATE BENCHMARK_STATIC_DEFINE)
-    if (onnxruntime_USE_SVE)
-      target_compile_definitions(onnxruntime_mlas_benchmark PRIVATE MLAS_USE_SVE)
-    endif()
+    target_compile_definitions(onnxruntime_mlas_benchmark PRIVATE ${mlas_private_compile_definitions})
     if(WIN32)
       target_link_libraries(onnxruntime_mlas_benchmark PRIVATE debug Dbghelp)
       # Avoid using new and delete. But this is a benchmark program, it's ok if it has a chance to leak.
@@ -1655,9 +1656,7 @@ endif()
         XCODE_ATTRIBUTE_CODE_SIGNING_ALLOWED "NO"
       )
     endif()
-    if (onnxruntime_USE_SVE)
-      target_compile_definitions(onnxruntime_mlas_test PRIVATE MLAS_USE_SVE)
-    endif()
+    target_compile_definitions(onnxruntime_mlas_test PRIVATE ${mlas_private_compile_definitions})
     target_include_directories(onnxruntime_mlas_test PRIVATE ${ONNXRUNTIME_ROOT}/core/mlas/inc ${ONNXRUNTIME_ROOT}
             ${CMAKE_CURRENT_BINARY_DIR})
     target_link_libraries(onnxruntime_mlas_test PRIVATE GTest::gtest GTest::gmock ${ONNXRUNTIME_MLAS_LIBS} onnxruntime_common)
 
@@ -73,12 +73,37 @@ void ParseExecutionProviders(const Napi::Array epList, Ort::SessionOptions& sess
         for (const auto& nameIter : obj.GetPropertyNames()) {
           Napi::Value nameVar = nameIter.second;
           std::string name = nameVar.As<Napi::String>().Utf8Value();
-          if (name != "name") {
-            Napi::Value valueVar = obj.Get(nameVar);
-            ORT_NAPI_THROW_TYPEERROR_IF(!valueVar.IsString(), epList.Env(), "Invalid argument: sessionOptions.executionProviders must be a string or an object with property 'name'.");
-            std::string value = valueVar.As<Napi::String>().Utf8Value();
-            webgpu_options[name] = value;
+          Napi::Value valueVar = obj.Get(nameVar);
+          std::string value;
+          if (name == "preferredLayout" ||
+              name == "validationMode" ||
+              name == "storageBufferCacheMode" ||
+              name == "uniformBufferCacheMode" ||
+              name == "queryResolveBufferCacheMode" ||
+              name == "defaultBufferCacheMode") {
+            ORT_NAPI_THROW_TYPEERROR_IF(!valueVar.IsString(), epList.Env(),
+                                        "Invalid argument: \"", name, "\" must be a string.");
+            value = valueVar.As<Napi::String>().Utf8Value();
+          } else if (name == "forceCpuNodeNames") {
+            ORT_NAPI_THROW_TYPEERROR_IF(!valueVar.IsArray(), epList.Env(),
+                                        "Invalid argument: \"forceCpuNodeNames\" must be a string array.");
+            auto arr = valueVar.As<Napi::Array>();
+            for (uint32_t i = 0; i < arr.Length(); i++) {
+              Napi::Value v = arr[i];
+              ORT_NAPI_THROW_TYPEERROR_IF(!v.IsString(), epList.Env(),
+                                          "Invalid argument: elements of \"forceCpuNodeNames\" must be strings.");
+              if (i > 0) {
+                value += '\n';
+              }
+              value += v.As<Napi::String>().Utf8Value();
+            }
+          } else {
+            // unrecognized option
+            ORT_NAPI_THROW_TYPEERROR_IF(name != "name", epList.Env(),
+                                        "Invalid argument: WebGPU EP has an unrecognized option: '", name, "'.");
+            continue;
           }
+          webgpu_options[name] = value;
         }
       }
 #endif
 
@@ -60,7 +60,7 @@ namespace cutlass_kernels {
 template <typename ActivationType, typename WeightType, typename ScaleZeroType, typename BiasType, typename OutputType,
           cutlass::WeightOnlyQuantOp QuantOp, typename EpilogueTag, typename CTAShape, typename ClusterShape,
           typename MainloopScheduleType, typename EpilogueScheduleType>
-#ifdef COMPILE_HOPPER_TMA_GEMMS
+#if defined(COMPILE_HOPPER_TMA_GEMMS) && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ == 900) && defined(__NV_SASS_VERSION__)
 void sm90_generic_mixed_gemm_kernelLauncher(
     ActivationType const* A, WeightType const* B,
     ScaleZeroType const* weight_scales, ScaleZeroType const* weight_zero_points, BiasType const* biases,
@@ -269,6 +269,7 @@ void sm90_generic_mixed_gemm_kernelLauncher(
   }
 }
 #else   // COMPILE_HOPPER_TMA_GEMMS
+// This stub is now used for ALL non-SASS or non-SM90A compilation passes, including the 90-virtual (PTX) pass.
 void sm90_generic_mixed_gemm_kernelLauncher(ActivationType const*, WeightType const*,
                                             ScaleZeroType const*, ScaleZeroType const*, BiasType const*,
                                             float const, OutputType*, int, int, int, int const, tkc::CutlassGemmConfig,
 
@@ -14,6 +14,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+#if USE_FPA_INTB_GEMM
 #include "contrib_ops/cuda/llm/fpA_intB_gemm_profiler.h"
 #include "contrib_ops/cuda/llm/common/workspace.h"
 
@@ -97,3 +98,4 @@ bool WeightOnlyGroupwiseQuantGemmPluginProfiler::checkTactic(int m, int /*n*/, i
 }
 
 }  // namespace onnxruntime::llm::kernels::weight_only
+#endif
@@ -134,11 +134,6 @@ struct TreeEnsembleAttributesV5 {
     for (auto i : nodes_modes_i) {
       nodes_modes.push_back(static_cast<NODE_MODE_ONNX>(i));
     }
-#else
-    // GetVectorAttrsOrDefault is not part of the minimal build.
-    // As a result, TreeEnsemble v5 cannot be available in this build.
-    ORT_THROW("TreeEnsemble(ai.onnx.ml==5) is not supported with the minimal build.");
-#endif
 
     aggregate_function = info.GetAttrOrDefault<int64_t>("aggregate_function", 1);
     leaf_targetids = info.GetAttrsOrDefault<int64_t>("leaf_targetids");
@@ -151,6 +146,11 @@ struct TreeEnsembleAttributesV5 {
     nodes_truenodeids = info.GetAttrsOrDefault<int64_t>("nodes_truenodeids");
     post_transform = info.GetAttrOrDefault<int64_t>("post_transform", 0);
     tree_roots = info.GetAttrsOrDefault<int64_t>("tree_roots");
+#else
+    // GetVectorAttrsOrDefault is not part of the minimal build.
+    // As a result, TreeEnsemble v5 cannot be available in this build.
+    ORT_THROW("TreeEnsemble(ai.onnx.ml==5) is not supported with the minimal build.");
+#endif
   }
 
   void convert_to_v3(TreeEnsembleAttributesV3<ThresholdType>& output) const {
 
@@ -3,7 +3,7 @@
 
 // Test can be run like the following:
 //  ./onnxruntime_provider_test --gtest_filter=CUDA_EP_Unittest.*
-
+#if USE_FPA_INTB_GEMM
 #include <cuda_profiler_api.h>
 #include <cuda_runtime.h>
 #include <gtest/gtest.h>
@@ -620,3 +620,4 @@ TEST_F(Bf16Int4GroupwiseTest, BF16_Int4_Gemm_CudaKernel) {
     }
   }
 }
+#endif
Original file line number	Diff line number	Diff line change
`@@ -3,7 +3,7 @@`
`3`	`3`
`4`	`4`	`// Test can be run like the following:`
`5`	`5`	`// ./onnxruntime_provider_test --gtest_filter=CUDA_EP_Unittest.*`
`6`		`-`
	`6`	`+#if USE_FPA_INTB_GEMM`
`7`	`7`	`#include <cuda_profiler_api.h>`
`8`	`8`	`#include <cuda_runtime.h>`
`9`	`9`	`#include <gtest/gtest.h>`
`@@ -620,3 +620,4 @@ TEST_F(Bf16Int4GroupwiseTest, BF16_Int4_Gemm_CudaKernel) {`
`620`	`620`	`}`
`621`	`621`	`}`
`622`	`622`	`}`
	`623`	`+#endif`