Skip to content

Commit dc47d84

Browse files
committed
Merged PR 621: Merge oss main with fj-develop
2 parents 1bfec70 + def8a93 commit dc47d84

33 files changed

+183
-272
lines changed

.github/workflows/gradle-wrapper-validation.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ jobs:
1616
runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-Ubuntu2204-AMD-CPU"]
1717
steps:
1818
- uses: actions/checkout@v5
19-
- uses: gradle/actions/wrapper-validation@v4
19+
- uses: gradle/actions/wrapper-validation@v5
2020
concurrency:
2121
group: ${{ github.workflow }}-${{ github.event_name == 'pull_request' && github.ref || github.sha }}
2222
cancel-in-progress: true

cmake/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,7 @@ option(onnxruntime_USE_VSINPU "Build with VSINPU support" OFF)
101101
cmake_dependent_option(onnxruntime_USE_FLASH_ATTENTION "Build flash attention kernel for scaled dot product attention" ON "onnxruntime_USE_CUDA" OFF)
102102
option(onnxruntime_USE_LEAN_ATTENTION "Build lean attention kernel for scaled dot product attention" OFF)
103103
cmake_dependent_option(onnxruntime_USE_MEMORY_EFFICIENT_ATTENTION "Build memory efficient attention kernel for scaled dot product attention" ON "onnxruntime_USE_CUDA" OFF)
104-
cmake_dependent_option(onnxruntime_USE_FPA_INTB_GEMM "Build FpA IntB gemm cuda kernels" ON "onnxruntime_USE_CUDA" OFF)
104+
option(onnxruntime_USE_FPA_INTB_GEMM "Build FpA IntB gemm cuda kernels" OFF)
105105

106106
option(onnxruntime_BUILD_FOR_NATIVE_MACHINE "Enable this option for turning on optimization specific to this machine" OFF)
107107
option(onnxruntime_USE_AVX "Use AVX instructions" OFF)

cmake/onnxruntime_mlas.cmake

Lines changed: 15 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,9 @@ set(MLAS_ROOT ${ONNXRUNTIME_ROOT}/core/mlas)
55
set(MLAS_SRC_DIR ${MLAS_ROOT}/lib)
66
set(MLAS_INC_DIR ${MLAS_ROOT}/inc)
77

8+
# mlas_private_compile_definitions contains compile definitions that are private to onnxruntime_mlas and targets which
9+
# use internal MLAS headers like mlasi.h.
10+
set(mlas_private_compile_definitions)
811
#
912
# All hardware agnostic source files here
1013
# hardware specific files would cause trouble in
@@ -133,9 +136,9 @@ function(setup_mlas_source_for_windows)
133136
)
134137

135138
if (onnxruntime_USE_ARM_NEON_NCHWC)
136-
setup_arm_neon_nchwc()
139+
setup_arm_neon_nchwc()
137140
endif()
138-
141+
139142
if (onnxruntime_USE_KLEIDIAI)
140143
setup_kleidiai()
141144
endif()
@@ -293,11 +296,12 @@ endfunction()
293296

294297
function (setup_arm_neon_nchwc)
295298
target_sources(onnxruntime_mlas PRIVATE
296-
${MLAS_SRC_DIR}/sconv.h
299+
${MLAS_SRC_DIR}/sconv.h
297300
${MLAS_SRC_DIR}/sconv_kernel_neon.cpp
298301
${MLAS_SRC_DIR}/spool_kernel_neon.cpp
299302
)
300-
target_compile_definitions(onnxruntime_mlas PRIVATE MLAS_USE_ARM_NEON_NCHWC)
303+
list(APPEND mlas_private_compile_definitions MLAS_USE_ARM_NEON_NCHWC)
304+
set(mlas_private_compile_definitions ${mlas_private_compile_definitions} PARENT_SCOPE)
301305
endfunction ()
302306

303307
if (CMAKE_SYSTEM_NAME STREQUAL "Emscripten")
@@ -445,25 +449,25 @@ else()
445449
${MLAS_SRC_DIR}/eltwise_kernel_neon.cpp
446450
${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon_int8_i8mm.cpp
447451
)
448-
452+
449453
# Conditionally add the SVE implementation if compiler supports it
450454
if (onnxruntime_USE_SVE)
451455
list(APPEND mlas_platform_srcs ${MLAS_SRC_DIR}/sve/mlasi_sve.h)
452456
list(APPEND mlas_platform_srcs ${MLAS_SRC_DIR}/sve/elementwise_sve.cpp)
453457
set_source_files_properties(${MLAS_SRC_DIR}/sve/elementwise_sve.cpp PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+sve+fp16 ")
454-
target_compile_definitions(onnxruntime_mlas PRIVATE MLAS_USE_SVE)
458+
list(APPEND mlas_private_compile_definitions MLAS_USE_SVE)
455459
endif()
456460

457461
if (onnxruntime_USE_ARM_NEON_NCHWC)
458-
setup_arm_neon_nchwc()
462+
setup_arm_neon_nchwc()
459463
endif()
460-
464+
461465
if (onnxruntime_USE_KLEIDIAI)
462466
setup_kleidiai()
463467
endif()
464468
set_source_files_properties(${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon_int8.cpp
465469
PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+dotprod")
466-
set_source_files_properties(${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon_int8_i8mm.cpp
470+
set_source_files_properties(${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon_int8_i8mm.cpp
467471
PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+i8mm ")
468472

469473
if (NOT APPLE)
@@ -806,6 +810,8 @@ foreach(mlas_target ${ONNXRUNTIME_MLAS_LIBS})
806810
target_include_directories(${mlas_target} PRIVATE ${MLAS_INC_DIR} ${MLAS_SRC_DIR})
807811
onnxruntime_add_include_to_target(${mlas_target} ${GSL_TARGET})
808812

813+
target_compile_definitions(${mlas_target} PRIVATE ${mlas_private_compile_definitions})
814+
809815
set_target_properties(${mlas_target} PROPERTIES FOLDER "ONNXRuntime")
810816
endforeach()
811817

cmake/onnxruntime_test_pch.cmake

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,11 @@ if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
55
target_precompile_headers(onnxruntime_test_all PRIVATE
66
"${CMAKE_CURRENT_SOURCE_DIR}/test_pch.h"
77
)
8-
target_precompile_headers(onnxruntime_provider_test PRIVATE
9-
"${CMAKE_CURRENT_SOURCE_DIR}/test_pch.h"
10-
)
8+
if (TARGET onnxruntime_provider_test)
9+
target_precompile_headers(onnxruntime_provider_test PRIVATE
10+
"${CMAKE_CURRENT_SOURCE_DIR}/test_pch.h"
11+
)
12+
endif()
1113
endif()
1214

1315
# Exclude certain files that might conflict with PCH

cmake/onnxruntime_unittests.cmake

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1228,6 +1228,11 @@ block()
12281228
LIBS ${onnxruntime_provider_test_libs}
12291229
DEPENDS ${onnxruntime_provider_test_deps}
12301230
)
1231+
if (UNIX AND (onnxruntime_USE_TENSORRT OR onnxruntime_USE_NV))
1232+
# test_main.cc includes NvInfer.h, which contains many deprecated declarations;
1233+
# suppress the resulting deprecation warnings for TensorRT EP builds.
1234+
set_property(TARGET onnxruntime_provider_test APPEND_STRING PROPERTY COMPILE_FLAGS "-Wno-deprecated-declarations")
1235+
endif()
12311236

12321237
# enable dynamic plugin EP usage
12331238
target_compile_definitions(onnxruntime_provider_test PRIVATE ORT_UNIT_TEST_ENABLE_DYNAMIC_PLUGIN_EP_USAGE)
@@ -1325,9 +1330,7 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP)
13251330
${BENCHMARK_DIR}/layer_normalization.cc)
13261331
target_include_directories(onnxruntime_benchmark PRIVATE ${ONNXRUNTIME_ROOT} ${onnxruntime_graph_header} ${ONNXRUNTIME_ROOT}/core/mlas/inc)
13271332
target_compile_definitions(onnxruntime_benchmark PRIVATE BENCHMARK_STATIC_DEFINE)
1328-
if (onnxruntime_USE_SVE)
1329-
target_compile_definitions(onnxruntime_benchmark PRIVATE MLAS_USE_SVE)
1330-
endif()
1333+
target_compile_definitions(onnxruntime_benchmark PRIVATE ${mlas_private_compile_definitions})
13311334
if(WIN32)
13321335
target_compile_options(onnxruntime_benchmark PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:--compiler-options /wd4141>"
13331336
"$<$<NOT:$<COMPILE_LANGUAGE:CUDA>>:/wd4141>")
@@ -1355,9 +1358,7 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP)
13551358
target_include_directories(onnxruntime_mlas_benchmark PRIVATE ${ONNXRUNTIME_ROOT}/core/mlas/inc)
13561359
target_link_libraries(onnxruntime_mlas_benchmark PRIVATE benchmark::benchmark onnxruntime_util ${ONNXRUNTIME_MLAS_LIBS} onnxruntime_common ${CMAKE_DL_LIBS})
13571360
target_compile_definitions(onnxruntime_mlas_benchmark PRIVATE BENCHMARK_STATIC_DEFINE)
1358-
if (onnxruntime_USE_SVE)
1359-
target_compile_definitions(onnxruntime_mlas_benchmark PRIVATE MLAS_USE_SVE)
1360-
endif()
1361+
target_compile_definitions(onnxruntime_mlas_benchmark PRIVATE ${mlas_private_compile_definitions})
13611362
if(WIN32)
13621363
target_link_libraries(onnxruntime_mlas_benchmark PRIVATE debug Dbghelp)
13631364
# Avoid using new and delete. But this is a benchmark program, it's ok if it has a chance to leak.
@@ -1655,9 +1656,7 @@ endif()
16551656
XCODE_ATTRIBUTE_CODE_SIGNING_ALLOWED "NO"
16561657
)
16571658
endif()
1658-
if (onnxruntime_USE_SVE)
1659-
target_compile_definitions(onnxruntime_mlas_test PRIVATE MLAS_USE_SVE)
1660-
endif()
1659+
target_compile_definitions(onnxruntime_mlas_test PRIVATE ${mlas_private_compile_definitions})
16611660
target_include_directories(onnxruntime_mlas_test PRIVATE ${ONNXRUNTIME_ROOT}/core/mlas/inc ${ONNXRUNTIME_ROOT}
16621661
${CMAKE_CURRENT_BINARY_DIR})
16631662
target_link_libraries(onnxruntime_mlas_test PRIVATE GTest::gtest GTest::gmock ${ONNXRUNTIME_MLAS_LIBS} onnxruntime_common)

js/node/src/session_options_helper.cc

Lines changed: 30 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -73,12 +73,37 @@ void ParseExecutionProviders(const Napi::Array epList, Ort::SessionOptions& sess
7373
for (const auto& nameIter : obj.GetPropertyNames()) {
7474
Napi::Value nameVar = nameIter.second;
7575
std::string name = nameVar.As<Napi::String>().Utf8Value();
76-
if (name != "name") {
77-
Napi::Value valueVar = obj.Get(nameVar);
78-
ORT_NAPI_THROW_TYPEERROR_IF(!valueVar.IsString(), epList.Env(), "Invalid argument: sessionOptions.executionProviders must be a string or an object with property 'name'.");
79-
std::string value = valueVar.As<Napi::String>().Utf8Value();
80-
webgpu_options[name] = value;
76+
Napi::Value valueVar = obj.Get(nameVar);
77+
std::string value;
78+
if (name == "preferredLayout" ||
79+
name == "validationMode" ||
80+
name == "storageBufferCacheMode" ||
81+
name == "uniformBufferCacheMode" ||
82+
name == "queryResolveBufferCacheMode" ||
83+
name == "defaultBufferCacheMode") {
84+
ORT_NAPI_THROW_TYPEERROR_IF(!valueVar.IsString(), epList.Env(),
85+
"Invalid argument: \"", name, "\" must be a string.");
86+
value = valueVar.As<Napi::String>().Utf8Value();
87+
} else if (name == "forceCpuNodeNames") {
88+
ORT_NAPI_THROW_TYPEERROR_IF(!valueVar.IsArray(), epList.Env(),
89+
"Invalid argument: \"forceCpuNodeNames\" must be a string array.");
90+
auto arr = valueVar.As<Napi::Array>();
91+
for (uint32_t i = 0; i < arr.Length(); i++) {
92+
Napi::Value v = arr[i];
93+
ORT_NAPI_THROW_TYPEERROR_IF(!v.IsString(), epList.Env(),
94+
"Invalid argument: elements of \"forceCpuNodeNames\" must be strings.");
95+
if (i > 0) {
96+
value += '\n';
97+
}
98+
value += v.As<Napi::String>().Utf8Value();
99+
}
100+
} else {
101+
// unrecognized option
102+
ORT_NAPI_THROW_TYPEERROR_IF(name != "name", epList.Env(),
103+
"Invalid argument: WebGPU EP has an unrecognized option: '", name, "'.");
104+
continue;
81105
}
106+
webgpu_options[name] = value;
82107
}
83108
}
84109
#endif

onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm/launchers/fpA_intB_launcher_sm90.inl

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ namespace cutlass_kernels {
6060
template <typename ActivationType, typename WeightType, typename ScaleZeroType, typename BiasType, typename OutputType,
6161
cutlass::WeightOnlyQuantOp QuantOp, typename EpilogueTag, typename CTAShape, typename ClusterShape,
6262
typename MainloopScheduleType, typename EpilogueScheduleType>
63-
#ifdef COMPILE_HOPPER_TMA_GEMMS
63+
#if defined(COMPILE_HOPPER_TMA_GEMMS) && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ == 900) && defined(__NV_SASS_VERSION__)
6464
void sm90_generic_mixed_gemm_kernelLauncher(
6565
ActivationType const* A, WeightType const* B,
6666
ScaleZeroType const* weight_scales, ScaleZeroType const* weight_zero_points, BiasType const* biases,
@@ -269,6 +269,7 @@ void sm90_generic_mixed_gemm_kernelLauncher(
269269
}
270270
}
271271
#else // COMPILE_HOPPER_TMA_GEMMS
272+
// This stub is now used for ALL non-SASS or non-SM90A compilation passes, including the 90-virtual (PTX) pass.
272273
void sm90_generic_mixed_gemm_kernelLauncher(ActivationType const*, WeightType const*,
273274
ScaleZeroType const*, ScaleZeroType const*, BiasType const*,
274275
float const, OutputType*, int, int, int, int const, tkc::CutlassGemmConfig,

onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm_profiler.cc

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
* See the License for the specific language governing permissions and
1515
* limitations under the License.
1616
*/
17+
#if USE_FPA_INTB_GEMM
1718
#include "contrib_ops/cuda/llm/fpA_intB_gemm_profiler.h"
1819
#include "contrib_ops/cuda/llm/common/workspace.h"
1920

@@ -97,3 +98,4 @@ bool WeightOnlyGroupwiseQuantGemmPluginProfiler::checkTactic(int m, int /*n*/, i
9798
}
9899

99100
} // namespace onnxruntime::llm::kernels::weight_only
101+
#endif

onnxruntime/core/providers/cpu/ml/tree_ensemble_attribute.h

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -134,11 +134,6 @@ struct TreeEnsembleAttributesV5 {
134134
for (auto i : nodes_modes_i) {
135135
nodes_modes.push_back(static_cast<NODE_MODE_ONNX>(i));
136136
}
137-
#else
138-
// GetVectorAttrsOrDefault is not part of the minimal build.
139-
// As a result, TreeEnsemble v5 cannot be available in this build.
140-
ORT_THROW("TreeEnsemble(ai.onnx.ml==5) is not supported with the minimal build.");
141-
#endif
142137

143138
aggregate_function = info.GetAttrOrDefault<int64_t>("aggregate_function", 1);
144139
leaf_targetids = info.GetAttrsOrDefault<int64_t>("leaf_targetids");
@@ -151,6 +146,11 @@ struct TreeEnsembleAttributesV5 {
151146
nodes_truenodeids = info.GetAttrsOrDefault<int64_t>("nodes_truenodeids");
152147
post_transform = info.GetAttrOrDefault<int64_t>("post_transform", 0);
153148
tree_roots = info.GetAttrsOrDefault<int64_t>("tree_roots");
149+
#else
150+
// GetVectorAttrsOrDefault is not part of the minimal build.
151+
// As a result, TreeEnsemble v5 cannot be available in this build.
152+
ORT_THROW("TreeEnsemble(ai.onnx.ml==5) is not supported with the minimal build.");
153+
#endif
154154
}
155155

156156
void convert_to_v3(TreeEnsembleAttributesV3<ThresholdType>& output) const {

onnxruntime/test/contrib_ops/cuda_kernels/fpA_intB_gemm_kernel_test.cc

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33

44
// Test can be run like the following:
55
// ./onnxruntime_provider_test --gtest_filter=CUDA_EP_Unittest.*
6-
6+
#if USE_FPA_INTB_GEMM
77
#include <cuda_profiler_api.h>
88
#include <cuda_runtime.h>
99
#include <gtest/gtest.h>
@@ -620,3 +620,4 @@ TEST_F(Bf16Int4GroupwiseTest, BF16_Int4_Gemm_CudaKernel) {
620620
}
621621
}
622622
}
623+
#endif

0 commit comments

Comments
 (0)